1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 // Author: kenton@google.com (Kenton Varda)
9 // Based on original Protocol Buffers design by
10 // Sanjay Ghemawat, Jeff Dean, and others.
11 //
12 // Class for parsing tokenized text from a ZeroCopyInputStream.
13
14 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
15 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
16
17 #include <string>
18 #include <vector>
19
20 #include "google/protobuf/stubs/common.h"
21 #include "absl/log/absl_log.h"
22 #include "absl/strings/string_view.h"
23 #include "google/protobuf/port.h"
24
25 // Must be included last.
26 #include "google/protobuf/port_def.inc"
27
28 namespace google {
29 namespace protobuf {
30 namespace io {
31
32 class ZeroCopyInputStream; // zero_copy_stream.h
33
34 // Defined in this file.
35 class ErrorCollector;
36 class Tokenizer;
37
// A "column number", as the proto compiler uses the term, is the count of
// bytes that precede a given byte on its line, with one exception: a tab
// character advances the count to the next multiple of 8.  Column numbers
// are zero-based, whereas many user interfaces display one-based columns.
using ColumnNumber = int;
43
44 // Abstract interface for an object which collects the errors that occur
45 // during parsing. A typical implementation might simply print the errors
46 // to stdout.
47 class PROTOBUF_EXPORT ErrorCollector {
48 public:
ErrorCollector()49 inline ErrorCollector() {}
50 ErrorCollector(const ErrorCollector&) = delete;
51 ErrorCollector& operator=(const ErrorCollector&) = delete;
52 virtual ~ErrorCollector();
53
54 // Indicates that there was an error in the input at the given line and
55 // column numbers. The numbers are zero-based, so you may want to add
56 // 1 to each before printing them.
57 virtual void RecordError(int line, ColumnNumber column,
58 absl::string_view message)
59 = 0;
60
61 // Indicates that there was a warning in the input at the given line and
62 // column numbers. The numbers are zero-based, so you may want to add
63 // 1 to each before printing them.
RecordWarning(int line,ColumnNumber column,absl::string_view message)64 virtual void RecordWarning(int line, ColumnNumber column,
65 absl::string_view message) {
66 }
67
68 };
69
// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped.  By default,
// C- and C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
class PROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  // Not copyable or assignable.
  Tokenizer(const Tokenizer&) = delete;
  Tokenizer& operator=(const Tokenizer&) = delete;
  ~Tokenizer();

  // The kinds of token the Tokenizer can report.
  enum TokenType {
    TYPE_START,  // Next() has not yet been called.
    TYPE_END,    // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
                      // the digits are decimal, but a prefix of "0x" indicates
                      // a hex number and a leading zero indicates octal, just
                      // like with C numeric literals.  A leading negative sign
                      // is NOT included in the token; it's up to the parser to
                      // interpret the unary minus operator on its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
                      // an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
                      // or double quotes can be used, but they must match.
                      // A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
    TYPE_WHITESPACE,  // A sequence of whitespace.  This token type is only
                      // produced if report_whitespace() is true.  It is not
                      // reported for whitespace within comments or strings.
    TYPE_NEWLINE,     // A newline (\n).  This token type is only
                      // produced if report_whitespace() is true and
                      // report_newlines() is true.  It is not reported for
                      // newlines in comments or strings.
  };

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    std::string text;  // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    ColumnNumber column;
    // NOTE(review): presumably the column just past the token's last
    // character -- this header does not say; confirm against the .cc file.
    ColumnNumber end_column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  const Token& current() const;

  // Return the previous token -- i.e. what current() returned before the
  // previous call to Next().
  const Token& previous() const;

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();

  // Like Next(), but also collects comments which appear between the previous
  // and next tokens.
  //
  // Comments which appear to be attached to the previous token are stored
  // in *prev_trailing_comments.  Comments which appear to be attached to the
  // next token are stored in *next_leading_comments.  Comments appearing in
  // between which do not appear to be attached to either will be added to
  // *detached_comments.  Any of these parameters can be NULL to simply discard
  // the comments.
  //
  // A series of line comments appearing on consecutive lines, with no other
  // tokens appearing on those lines, will be treated as a single comment.
  //
  // Only the comment content is returned; comment markers (e.g. //) are
  // stripped out.  For block comments, leading whitespace and an asterisk will
  // be stripped from the beginning of each line other than the first.  Newlines
  // are included in the output.
  //
  // Examples:
  //
  //   optional int32 foo = 1;  // Comment attached to foo.
  //   // Comment attached to bar.
  //   optional int32 bar = 2;
  //
  //   optional string baz = 3;
  //   // Comment attached to baz.
  //   // Another line attached to baz.
  //
  //   // Comment attached to qux.
  //   //
  //   // Another line attached to qux.
  //   optional double qux = 4;
  //
  //   // Detached comment.  This is not attached to qux or corge
  //   // because there are blank lines separating it from both.
  //
  //   optional string corge = 5;
  //   /* Block comment attached
  //    * to corge.  Leading asterisks
  //    * will be removed. */
  //   /* Block comment attached to
  //    * grault. */
  //   optional int32 grault = 6;
  bool NextWithComments(std::string* prev_trailing_comments,
                        std::vector<std::string>* detached_comments,
                        std::string* next_leading_comments);

  // Parse helpers ---------------------------------------------------

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const std::string& text);

  // Parses given text as if it were a TYPE_FLOAT token.  Returns false if the
  // given text is not actually a valid float literal.
  static bool TryParseFloat(const std::string& text, double* result);

  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static void ParseString(const std::string& text, std::string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const std::string& text, std::string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const std::string& text, uint64_t max_value,
                           uint64_t* output);

  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }

  // Whether to require whitespace between a number and a field name.
  // Default is true.  Do not use this; for Google-internal cleanup only.
  void set_require_space_after_number(bool require) {
    require_space_after_number_ = require;
  }

  // Whether to allow string literals to span multiple lines.  Default is false.
  // Do not use this; for Google-internal cleanup only.
  void set_allow_multiline_strings(bool allow) {
    allow_multiline_strings_ = allow;
  }

  // If true, whitespace tokens are reported by Next().
  // Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
  bool report_whitespace() const;
  void set_report_whitespace(bool report);

  // If true, newline tokens are reported by Next().
  // Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
  bool report_newlines() const;
  void set_report_newlines(bool report);

  // External helper: validate an identifier.
  static bool IsIdentifier(const std::string& text);

  // -----------------------------------------------------------------
 private:
  Token current_;   // Returned by current().
  Token previous_;  // Returned by previous().

  // Not owned; see the constructor comment.
  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;   // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;  // Current buffer returned from input_.
  int buffer_size_;     // Size of buffer_.
  int buffer_pos_;      // Current position within the buffer.
  bool read_error_;     // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  ColumnNumber column_;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
  // position within the current buffer where recording started.
  std::string* record_target_;
  int record_start_;

  // Options.  The first four have no in-class initializer here; they are
  // presumably set by the constructor in the .cc file -- confirm.
  bool allow_f_after_float_;
  CommentStyle comment_style_;
  bool require_space_after_number_;
  bool allow_multiline_strings_;
  bool report_whitespace_ = false;
  bool report_newlines_ = false;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  // This must match the documentation of ColumnNumber.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  // Start / stop appending consumed text to a target string; see the comment
  // on record_target_ above.
  inline void RecordTo(std::string* target);
  inline void StopRecording();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  void AddError(const std::string& message) {
    error_collector_->RecordError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.
  // It also needs to know if the first character was a . to parse floating
  // point correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment(std::string* content);
  // Consume until "*/".
  void ConsumeBlockComment(std::string* content);

  // Result of TryConsumeCommentStart().
  enum NextCommentStatus {
    // Started a line comment.
    LINE_COMMENT,

    // Started a block comment.
    BLOCK_COMMENT,

    // Consumed a slash, then realized it wasn't a comment.  current_ has
    // been filled in with a slash token.  The caller should return it.
    SLASH_NOT_COMMENT,

    // We do not appear to be starting a comment here.
    NO_COMMENT
  };

  // If we're at the start of a new comment, consume it and return what kind
  // of comment it is.
  NextCommentStatus TryConsumeCommentStart();

  // If we're looking at a TYPE_WHITESPACE token and `report_whitespace_` is
  // true, consume it and return true.
  bool TryConsumeWhitespace();

  // If we're looking at a TYPE_NEWLINE token and `report_newlines_` is true,
  // consume it and return true.
  bool TryConsumeNewline();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" referred to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template <typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template <typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template <typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template <typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};
408
409 // inline methods ====================================================
current()410 inline const Tokenizer::Token& Tokenizer::current() const { return current_; }
411
previous()412 inline const Tokenizer::Token& Tokenizer::previous() const { return previous_; }
413
ParseString(const std::string & text,std::string * output)414 inline void Tokenizer::ParseString(const std::string& text,
415 std::string* output) {
416 output->clear();
417 ParseStringAppend(text, output);
418 }
419
420 } // namespace io
421 } // namespace protobuf
422 } // namespace google
423
424 #include "google/protobuf/port_undef.inc"
425
426 #endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
427