// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.

37 #ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38 #define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39 
40 #include <string>
41 #include <google/protobuf/stubs/common.h>
42 
43 namespace google {
44 namespace protobuf {
45 namespace io {
46 
class ZeroCopyInputStream;     // zero_copy_stream.h

// Defined in this file.
class ErrorCollector;
class Tokenizer;

53 // Abstract interface for an object which collects the errors that occur
54 // during parsing.  A typical implementation might simply print the errors
55 // to stdout.
56 class LIBPROTOBUF_EXPORT ErrorCollector {
57  public:
ErrorCollector()58   inline ErrorCollector() {}
59   virtual ~ErrorCollector();
60 
61   // Indicates that there was an error in the input at the given line and
62   // column numbers.  The numbers are zero-based, so you may want to add
63   // 1 to each before printing them.
64   virtual void AddError(int line, int column, const string& message) = 0;
65 
66   // Indicates that there was a warning in the input at the given line and
67   // column numbers.  The numbers are zero-based, so you may want to add
68   // 1 to each before printing them.
AddWarning(int line,int column,const string & message)69   virtual void AddWarning(int line, int column, const string& message) { }
70 
71  private:
72   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
73 };
74 
75 // This class converts a stream of raw text into a stream of tokens for
76 // the protocol definition parser to parse.  The tokens recognized are
77 // similar to those that make up the C language; see the TokenType enum for
78 // precise descriptions.  Whitespace and comments are skipped.  By default,
79 // C- and C++-style comments are recognized, but other styles can be used by
80 // calling set_comment_style().
81 class LIBPROTOBUF_EXPORT Tokenizer {
82  public:
83   // Construct a Tokenizer that reads and tokenizes text from the given
84   // input stream and writes errors to the given error_collector.
85   // The caller keeps ownership of input and error_collector.
86   Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
87   ~Tokenizer();
88 
89   enum TokenType {
90     TYPE_START,       // Next() has not yet been called.
91     TYPE_END,         // End of input reached.  "text" is empty.
92 
93     TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
94                       // starting with a digit.  It is an error for a number
95                       // to be followed by an identifier with no space in
96                       // between.
97     TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
98                       // the digits are decimal, but a prefix of "0x" indicates
99                       // a hex number and a leading zero indicates octal, just
100                       // like with C numeric literals.  A leading negative sign
101                       // is NOT included in the token; it's up to the parser to
102                       // interpret the unary minus operator on its own.
103     TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
104                       // an exponent.  Always in decimal.  Again, never
105                       // negative.
106     TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
107                       // or double quotes can be used, but they must match.
108                       // A string literal cannot cross a line break.
109     TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
110                       // Symbols are always a single character, so "!+$%" is
111                       // four tokens.
112   };
113 
114   // Structure representing a token read from the token stream.
115   struct Token {
116     TokenType type;
117     string text;       // The exact text of the token as it appeared in
118                        // the input.  e.g. tokens of TYPE_STRING will still
119                        // be escaped and in quotes.
120 
121     // "line" and "column" specify the position of the first character of
122     // the token within the input stream.  They are zero-based.
123     int line;
124     int column;
125   };
126 
127   // Get the current token.  This is updated when Next() is called.  Before
128   // the first call to Next(), current() has type TYPE_START and no contents.
129   const Token& current();
130 
131   // Advance to the next token.  Returns false if the end of the input is
132   // reached.
133   bool Next();
134 
135   // Parse helpers ---------------------------------------------------
136 
137   // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
138   // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
139   // result is undefined (possibly an assert failure).
140   static double ParseFloat(const string& text);
141 
142   // Parses a TYPE_STRING token.  This never fails, so long as the text actually
143   // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
144   // result is undefined (possibly an assert failure).
145   static void ParseString(const string& text, string* output);
146 
147   // Identical to ParseString, but appends to output.
148   static void ParseStringAppend(const string& text, string* output);
149 
150   // Parses a TYPE_INTEGER token.  Returns false if the result would be
151   // greater than max_value.  Otherwise, returns true and sets *output to the
152   // result.  If the text is not from a Token of type TYPE_INTEGER originally
153   // parsed by a Tokenizer, the result is undefined (possibly an assert
154   // failure).
155   static bool ParseInteger(const string& text, uint64 max_value,
156                            uint64* output);
157 
158   // Options ---------------------------------------------------------
159 
160   // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
161   // which would otherwise be integers but which have the 'f' suffix will be
162   // forced to be interpreted as floats.  For all other purposes, the 'f' is
163   // ignored.
set_allow_f_after_float(bool value)164   void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
165 
166   // Valid values for set_comment_style().
167   enum CommentStyle {
168     // Line comments begin with "//", block comments are delimited by "/*" and
169     // "*/".
170     CPP_COMMENT_STYLE,
171     // Line comments begin with "#".  No way to write block comments.
172     SH_COMMENT_STYLE
173   };
174 
175   // Sets the comment style.
set_comment_style(CommentStyle style)176   void set_comment_style(CommentStyle style) { comment_style_ = style; }
177 
178   // -----------------------------------------------------------------
179  private:
180   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
181 
182   Token current_;           // Returned by current().
183 
184   ZeroCopyInputStream* input_;
185   ErrorCollector* error_collector_;
186 
187   char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
188   const char* buffer_;      // Current buffer returned from input_.
189   int buffer_size_;         // Size of buffer_.
190   int buffer_pos_;          // Current position within the buffer.
191   bool read_error_;         // Did we previously encounter a read error?
192 
193   // Line and column number of current_char_ within the whole input stream.
194   int line_;
195   int column_;
196 
197   // Position in buffer_ where StartToken() was called.  If the token
198   // started in the previous buffer, this is zero, and current_.text already
199   // contains the part of the token from the previous buffer.  If not
200   // currently parsing a token, this is -1.
201   int token_start_;
202 
203   // Options.
204   bool allow_f_after_float_;
205   CommentStyle comment_style_;
206 
207   // Since we count columns we need to interpret tabs somehow.  We'll take
208   // the standard 8-character definition for lack of any way to do better.
209   static const int kTabWidth = 8;
210 
211   // -----------------------------------------------------------------
212   // Helper methods.
213 
214   // Consume this character and advance to the next one.
215   void NextChar();
216 
217   // Read a new buffer from the input.
218   void Refresh();
219 
220   // Called when the current character is the first character of a new
221   // token (not including whitespace or comments).
222   inline void StartToken();
223   // Called when the current character is the first character after the
224   // end of the last token.  After this returns, current_.text will
225   // contain all text consumed since StartToken() was called.
226   inline void EndToken();
227 
228   // Convenience method to add an error at the current line and column.
AddError(const string & message)229   void AddError(const string& message) {
230     error_collector_->AddError(line_, column_, message);
231   }
232 
233   // -----------------------------------------------------------------
234   // The following four methods are used to consume tokens of specific
235   // types.  They are actually used to consume all characters *after*
236   // the first, since the calling function consumes the first character
237   // in order to decide what kind of token is being read.
238 
239   // Read and consume a string, ending when the given delimiter is
240   // consumed.
241   void ConsumeString(char delimiter);
242 
243   // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
244   // depending on what was read.  This needs to know if the first
245   // character was a zero in order to correctly recognize hex and octal
246   // numbers.
247   // It also needs to know if the first characted was a . to parse floating
248   // point correctly.
249   TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
250 
251   // Consume the rest of a line.
252   void ConsumeLineComment();
253   // Consume until "*/".
254   void ConsumeBlockComment();
255 
256   // -----------------------------------------------------------------
257   // These helper methods make the parsing code more readable.  The
258   // "character classes" refered to are defined at the top of the .cc file.
259   // Basically it is a C++ class with one method:
260   //   static bool InClass(char c);
261   // The method returns true if c is a member of this "class", like "Letter"
262   // or "Digit".
263 
264   // Returns true if the current character is of the given character
265   // class, but does not consume anything.
266   template<typename CharacterClass>
267   inline bool LookingAt();
268 
269   // If the current character is in the given class, consume it and return
270   // true.  Otherwise return false.
271   // e.g. TryConsumeOne<Letter>()
272   template<typename CharacterClass>
273   inline bool TryConsumeOne();
274 
275   // Like above, but try to consume the specific character indicated.
276   inline bool TryConsume(char c);
277 
278   // Consume zero or more of the given character class.
279   template<typename CharacterClass>
280   inline void ConsumeZeroOrMore();
281 
282   // Consume one or more of the given character class or log the given
283   // error message.
284   // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
285   template<typename CharacterClass>
286   inline void ConsumeOneOrMore(const char* error);
287 };
288 
289 // inline methods ====================================================
current()290 inline const Tokenizer::Token& Tokenizer::current() {
291   return current_;
292 }
293 
ParseString(const string & text,string * output)294 inline void Tokenizer::ParseString(const string& text, string* output) {
295   output->clear();
296   ParseStringAppend(text, output);
297 }
298 
299 }  // namespace io
300 }  // namespace protobuf
301 
302 }  // namespace google
303 #endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
304