// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Implements parsing of .proto files to FileDescriptorProtos.
36 37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 39 40 #include <map> 41 #include <string> 42 #include <utility> 43 #include <google/protobuf/descriptor.h> 44 #include <google/protobuf/descriptor.pb.h> 45 #include <google/protobuf/repeated_field.h> 46 #include <google/protobuf/io/tokenizer.h> 47 48 namespace google { 49 namespace protobuf { class Message; } 50 51 namespace protobuf { 52 namespace compiler { 53 54 // Defined in this file. 55 class Parser; 56 class SourceLocationTable; 57 58 // Implements parsing of protocol definitions (such as .proto files). 59 // 60 // Note that most users will be more interested in the Importer class. 61 // Parser is a lower-level class which simply converts a single .proto file 62 // to a FileDescriptorProto. It does not resolve import directives or perform 63 // many other kinds of validation needed to construct a complete 64 // FileDescriptor. 65 class LIBPROTOBUF_EXPORT Parser { 66 public: 67 Parser(); 68 ~Parser(); 69 70 // Parse the entire input and construct a FileDescriptorProto representing 71 // it. Returns true if no errors occurred, false otherwise. 72 bool Parse(io::Tokenizer* input, FileDescriptorProto* file); 73 74 // Optional fetaures: 75 76 // DEPRECATED: New code should use the SourceCodeInfo embedded in the 77 // FileDescriptorProto. 78 // 79 // Requests that locations of certain definitions be recorded to the given 80 // SourceLocationTable while parsing. This can be used to look up exact line 81 // and column numbers for errors reported by DescriptorPool during validation. 82 // Set to NULL (the default) to discard source location information. RecordSourceLocationsTo(SourceLocationTable * location_table)83 void RecordSourceLocationsTo(SourceLocationTable* location_table) { 84 source_location_table_ = location_table; 85 } 86 87 // Requests that errors be recorded to the given ErrorCollector while 88 // parsing. Set to NULL (the default) to discard error messages. 
RecordErrorsTo(io::ErrorCollector * error_collector)89 void RecordErrorsTo(io::ErrorCollector* error_collector) { 90 error_collector_ = error_collector; 91 } 92 93 // Returns the identifier used in the "syntax = " declaration, if one was 94 // seen during the last call to Parse(), or the empty string otherwise. GetSyntaxIdentifier()95 const string& GetSyntaxIdentifier() { return syntax_identifier_; } 96 97 // If set true, input files will be required to begin with a syntax 98 // identifier. Otherwise, files may omit this. If a syntax identifier 99 // is provided, it must be 'syntax = "proto2";' and must appear at the 100 // top of this file regardless of whether or not it was required. SetRequireSyntaxIdentifier(bool value)101 void SetRequireSyntaxIdentifier(bool value) { 102 require_syntax_identifier_ = value; 103 } 104 105 // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop 106 // parsing as soon as it has seen the syntax identifier, or lack thereof. 107 // This is useful for quickly identifying the syntax of the file without 108 // parsing the whole thing. If this is enabled, no error will be recorded 109 // if the syntax identifier is something other than "proto2" (since 110 // presumably the caller intends to deal with that), but other kinds of 111 // errors (e.g. parse errors) will still be reported. When this is enabled, 112 // you may pass a NULL FileDescriptorProto to Parse(). SetStopAfterSyntaxIdentifier(bool value)113 void SetStopAfterSyntaxIdentifier(bool value) { 114 stop_after_syntax_identifier_ = value; 115 } 116 117 private: 118 class LocationRecorder; 119 120 // ================================================================= 121 // Error recovery helpers 122 123 // Consume the rest of the current statement. This consumes tokens 124 // until it sees one of: 125 // ';' Consumes the token and returns. 126 // '{' Consumes the brace then calls SkipRestOfBlock(). 127 // '}' Returns without consuming. 
128 // EOF Returns (can't consume). 129 // The Parser often calls SkipStatement() after encountering a syntax 130 // error. This allows it to go on parsing the following lines, allowing 131 // it to report more than just one error in the file. 132 void SkipStatement(); 133 134 // Consume the rest of the current block, including nested blocks, 135 // ending after the closing '}' is encountered and consumed, or at EOF. 136 void SkipRestOfBlock(); 137 138 // ----------------------------------------------------------------- 139 // Single-token consuming helpers 140 // 141 // These make parsing code more readable. 142 143 // True if the current token is TYPE_END. 144 inline bool AtEnd(); 145 146 // True if the next token matches the given text. 147 inline bool LookingAt(const char* text); 148 // True if the next token is of the given type. 149 inline bool LookingAtType(io::Tokenizer::TokenType token_type); 150 151 // If the next token exactly matches the text given, consume it and return 152 // true. Otherwise, return false without logging an error. 153 bool TryConsume(const char* text); 154 155 // These attempt to read some kind of token from the input. If successful, 156 // they return true. Otherwise they return false and add the given error 157 // to the error list. 158 159 // Consume a token with the exact text given. 160 bool Consume(const char* text, const char* error); 161 // Same as above, but automatically generates the error "Expected \"text\".", 162 // where "text" is the expected token text. 163 bool Consume(const char* text); 164 // Consume a token of type IDENTIFIER and store its text in "output". 165 bool ConsumeIdentifier(string* output, const char* error); 166 // Consume an integer and store its value in "output". 167 bool ConsumeInteger(int* output, const char* error); 168 // Consume a signed integer and store its value in "output". 
169 bool ConsumeSignedInteger(int* output, const char* error); 170 // Consume a 64-bit integer and store its value in "output". If the value 171 // is greater than max_value, an error will be reported. 172 bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error); 173 // Consume a number and store its value in "output". This will accept 174 // tokens of either INTEGER or FLOAT type. 175 bool ConsumeNumber(double* output, const char* error); 176 // Consume a string literal and store its (unescaped) value in "output". 177 bool ConsumeString(string* output, const char* error); 178 179 // Consume a token representing the end of the statement. Comments between 180 // this token and the next will be harvested for documentation. The given 181 // LocationRecorder should refer to the declaration that was just parsed; 182 // it will be populated with these comments. 183 // 184 // TODO(kenton): The LocationRecorder is const because historically locations 185 // have been passed around by const reference, for no particularly good 186 // reason. We should probably go through and change them all to mutable 187 // pointer to make this more intuitive. 188 bool TryConsumeEndOfDeclaration(const char* text, 189 const LocationRecorder* location); 190 bool ConsumeEndOfDeclaration(const char* text, 191 const LocationRecorder* location); 192 193 // ----------------------------------------------------------------- 194 // Error logging helpers 195 196 // Invokes error_collector_->AddError(), if error_collector_ is not NULL. 197 void AddError(int line, int column, const string& error); 198 199 // Invokes error_collector_->AddError() with the line and column number 200 // of the current token. 201 void AddError(const string& error); 202 203 // Records a location in the SourceCodeInfo.location table (see 204 // descriptor.proto). 
We use RAII to ensure that the start and end locations 205 // are recorded -- the constructor records the start location and the 206 // destructor records the end location. Since the parser is 207 // recursive-descent, this works out beautifully. 208 class LIBPROTOBUF_EXPORT LocationRecorder { 209 public: 210 // Construct the file's "root" location. 211 LocationRecorder(Parser* parser); 212 213 // Construct a location that represents a declaration nested within the 214 // given parent. E.g. a field's location is nested within the location 215 // for a message type. The parent's path will be copied, so you should 216 // call AddPath() only to add the path components leading from the parent 217 // to the child (as opposed to leading from the root to the child). 218 LocationRecorder(const LocationRecorder& parent); 219 220 // Convenience constructors that call AddPath() one or two times. 221 LocationRecorder(const LocationRecorder& parent, int path1); 222 LocationRecorder(const LocationRecorder& parent, int path1, int path2); 223 224 ~LocationRecorder(); 225 226 // Add a path component. See SourceCodeInfo.Location.path in 227 // descriptor.proto. 228 void AddPath(int path_component); 229 230 // By default the location is considered to start at the current token at 231 // the time the LocationRecorder is created. StartAt() sets the start 232 // location to the given token instead. 233 void StartAt(const io::Tokenizer::Token& token); 234 235 // Start at the same location as some other LocationRecorder. 236 void StartAt(const LocationRecorder& other); 237 238 // By default the location is considered to end at the previous token at 239 // the time the LocationRecorder is destroyed. EndAt() sets the end 240 // location to the given token instead. 241 void EndAt(const io::Tokenizer::Token& token); 242 243 // Records the start point of this location to the SourceLocationTable that 244 // was passed to RecordSourceLocationsTo(), if any. 
SourceLocationTable 245 // is an older way of keeping track of source locations which is still 246 // used in some places. 247 void RecordLegacyLocation(const Message* descriptor, 248 DescriptorPool::ErrorCollector::ErrorLocation location); 249 250 // Attaches leading and trailing comments to the location. The two strings 251 // will be swapped into place, so after this is called *leading and 252 // *trailing will be empty. 253 // 254 // TODO(kenton): See comment on TryConsumeEndOfDeclaration(), above, for 255 // why this is const. 256 void AttachComments(string* leading, string* trailing) const; 257 258 private: 259 Parser* parser_; 260 SourceCodeInfo::Location* location_; 261 262 void Init(const LocationRecorder& parent); 263 }; 264 265 // ================================================================= 266 // Parsers for various language constructs 267 268 // Parses the "syntax = \"proto2\";" line at the top of the file. Returns 269 // false if it failed to parse or if the syntax identifier was not 270 // recognized. 271 bool ParseSyntaxIdentifier(); 272 273 // These methods parse various individual bits of code. They return 274 // false if they completely fail to parse the construct. In this case, 275 // it is probably necessary to skip the rest of the statement to recover. 276 // However, if these methods return true, it does NOT mean that there 277 // were no errors; only that there were no *syntax* errors. For instance, 278 // if a service method is defined using proper syntax but uses a primitive 279 // type as its input or output, ParseMethodField() still returns true 280 // and only reports the error by calling AddError(). In practice, this 281 // makes logic much simpler for the caller. 282 283 // Parse a top-level message, enum, service, etc. 284 bool ParseTopLevelStatement(FileDescriptorProto* file, 285 const LocationRecorder& root_location); 286 287 // Parse various language high-level language construrcts. 
288 bool ParseMessageDefinition(DescriptorProto* message, 289 const LocationRecorder& message_location, 290 const FileDescriptorProto* containing_file); 291 bool ParseEnumDefinition(EnumDescriptorProto* enum_type, 292 const LocationRecorder& enum_location, 293 const FileDescriptorProto* containing_file); 294 bool ParseServiceDefinition(ServiceDescriptorProto* service, 295 const LocationRecorder& service_location, 296 const FileDescriptorProto* containing_file); 297 bool ParsePackage(FileDescriptorProto* file, 298 const LocationRecorder& root_location, 299 const FileDescriptorProto* containing_file); 300 bool ParseImport(RepeatedPtrField<string>* dependency, 301 RepeatedField<int32>* public_dependency, 302 RepeatedField<int32>* weak_dependency, 303 const LocationRecorder& root_location, 304 const FileDescriptorProto* containing_file); 305 bool ParseOption(Message* options, 306 const LocationRecorder& options_location, 307 const FileDescriptorProto* containing_file); 308 309 // These methods parse the contents of a message, enum, or service type and 310 // add them to the given object. They consume the entire block including 311 // the beginning and ending brace. 312 bool ParseMessageBlock(DescriptorProto* message, 313 const LocationRecorder& message_location, 314 const FileDescriptorProto* containing_file); 315 bool ParseEnumBlock(EnumDescriptorProto* enum_type, 316 const LocationRecorder& enum_location, 317 const FileDescriptorProto* containing_file); 318 bool ParseServiceBlock(ServiceDescriptorProto* service, 319 const LocationRecorder& service_location, 320 const FileDescriptorProto* containing_file); 321 322 // Parse one statement within a message, enum, or service block, inclunding 323 // final semicolon. 
324 bool ParseMessageStatement(DescriptorProto* message, 325 const LocationRecorder& message_location, 326 const FileDescriptorProto* containing_file); 327 bool ParseEnumStatement(EnumDescriptorProto* message, 328 const LocationRecorder& enum_location, 329 const FileDescriptorProto* containing_file); 330 bool ParseServiceStatement(ServiceDescriptorProto* message, 331 const LocationRecorder& service_location, 332 const FileDescriptorProto* containing_file); 333 334 // Parse a field of a message. If the field is a group, its type will be 335 // added to "messages". 336 // 337 // parent_location and location_field_number_for_nested_type are needed when 338 // parsing groups -- we need to generate a nested message type within the 339 // parent and record its location accordingly. Since the parent could be 340 // either a FileDescriptorProto or a DescriptorProto, we must pass in the 341 // correct field number to use. 342 bool ParseMessageField(FieldDescriptorProto* field, 343 RepeatedPtrField<DescriptorProto>* messages, 344 const LocationRecorder& parent_location, 345 int location_field_number_for_nested_type, 346 const LocationRecorder& field_location, 347 const FileDescriptorProto* containing_file); 348 349 // Like ParseMessageField() but expects the label has already been filled in 350 // by the caller. 351 bool ParseMessageFieldNoLabel(FieldDescriptorProto* field, 352 RepeatedPtrField<DescriptorProto>* messages, 353 const LocationRecorder& parent_location, 354 int location_field_number_for_nested_type, 355 const LocationRecorder& field_location, 356 const FileDescriptorProto* containing_file); 357 358 // Parse an "extensions" declaration. 359 bool ParseExtensions(DescriptorProto* message, 360 const LocationRecorder& extensions_location, 361 const FileDescriptorProto* containing_file); 362 363 // Parse an "extend" declaration. (See also comments for 364 // ParseMessageField().) 
365 bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions, 366 RepeatedPtrField<DescriptorProto>* messages, 367 const LocationRecorder& parent_location, 368 int location_field_number_for_nested_type, 369 const LocationRecorder& extend_location, 370 const FileDescriptorProto* containing_file); 371 372 // Parse a "oneof" declaration. The caller is responsible for setting 373 // oneof_decl->label() since it will have had to parse the label before it 374 // knew it was parsing a oneof. 375 bool ParseOneof(OneofDescriptorProto* oneof_decl, 376 DescriptorProto* containing_type, 377 int oneof_index, 378 const LocationRecorder& oneof_location, 379 const LocationRecorder& containing_type_location, 380 const FileDescriptorProto* containing_file); 381 382 // Parse a single enum value within an enum block. 383 bool ParseEnumConstant(EnumValueDescriptorProto* enum_value, 384 const LocationRecorder& enum_value_location, 385 const FileDescriptorProto* containing_file); 386 387 // Parse enum constant options, i.e. the list in square brackets at the end 388 // of the enum constant value definition. 389 bool ParseEnumConstantOptions(EnumValueDescriptorProto* value, 390 const LocationRecorder& enum_value_location, 391 const FileDescriptorProto* containing_file); 392 393 // Parse a single method within a service definition. 394 bool ParseServiceMethod(MethodDescriptorProto* method, 395 const LocationRecorder& method_location, 396 const FileDescriptorProto* containing_file); 397 398 399 // Parse options of a single method or stream. 400 bool ParseOptions(const LocationRecorder& parent_location, 401 const FileDescriptorProto* containing_file, 402 const int optionsFieldNumber, 403 Message* mutable_options); 404 405 // Parse "required", "optional", or "repeated" and fill in "label" 406 // with the value. 
407 bool ParseLabel(FieldDescriptorProto::Label* label, 408 const FileDescriptorProto* containing_file); 409 410 // Parse a type name and fill in "type" (if it is a primitive) or 411 // "type_name" (if it is not) with the type parsed. 412 bool ParseType(FieldDescriptorProto::Type* type, 413 string* type_name); 414 // Parse a user-defined type and fill in "type_name" with the name. 415 // If a primitive type is named, it is treated as an error. 416 bool ParseUserDefinedType(string* type_name); 417 418 // Parses field options, i.e. the stuff in square brackets at the end 419 // of a field definition. Also parses default value. 420 bool ParseFieldOptions(FieldDescriptorProto* field, 421 const LocationRecorder& field_location, 422 const FileDescriptorProto* containing_file); 423 424 // Parse the "default" option. This needs special handling because its 425 // type is the field's type. 426 bool ParseDefaultAssignment(FieldDescriptorProto* field, 427 const LocationRecorder& field_location, 428 const FileDescriptorProto* containing_file); 429 430 enum OptionStyle { 431 OPTION_ASSIGNMENT, // just "name = value" 432 OPTION_STATEMENT // "option name = value;" 433 }; 434 435 // Parse a single option name/value pair, e.g. "ctype = CORD". The name 436 // identifies a field of the given Message, and the value of that field 437 // is set to the parsed value. 438 bool ParseOption(Message* options, 439 const LocationRecorder& options_location, 440 const FileDescriptorProto* containing_file, 441 OptionStyle style); 442 443 // Parses a single part of a multipart option name. A multipart name consists 444 // of names separated by dots. Each name is either an identifier or a series 445 // of identifiers separated by dots and enclosed in parentheses. E.g., 446 // "foo.(bar.baz).qux". 
447 bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option, 448 const LocationRecorder& part_location, 449 const FileDescriptorProto* containing_file); 450 451 // Parses a string surrounded by balanced braces. Strips off the outer 452 // braces and stores the enclosed string in *value. 453 // E.g., 454 // { foo } *value gets 'foo' 455 // { foo { bar: box } } *value gets 'foo { bar: box }' 456 // {} *value gets '' 457 // 458 // REQUIRES: LookingAt("{") 459 // When finished successfully, we are looking at the first token past 460 // the ending brace. 461 bool ParseUninterpretedBlock(string* value); 462 463 // ================================================================= 464 465 io::Tokenizer* input_; 466 io::ErrorCollector* error_collector_; 467 SourceCodeInfo* source_code_info_; 468 SourceLocationTable* source_location_table_; // legacy 469 bool had_errors_; 470 bool require_syntax_identifier_; 471 bool stop_after_syntax_identifier_; 472 string syntax_identifier_; 473 474 // Leading doc comments for the next declaration. These are not complete 475 // yet; use ConsumeEndOfDeclaration() to get the complete comments. 476 string upcoming_doc_comments_; 477 478 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser); 479 }; 480 481 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by 482 // DescriptorPool when validating descriptors -- to line and column numbers 483 // within the original source code. 484 // 485 // This is semi-obsolete: FileDescriptorProto.source_code_info now contains 486 // far more complete information about source locations. However, as of this 487 // writing you still need to use SourceLocationTable when integrating with 488 // DescriptorPool. 489 class LIBPROTOBUF_EXPORT SourceLocationTable { 490 public: 491 SourceLocationTable(); 492 ~SourceLocationTable(); 493 494 // Finds the precise location of the given error and fills in *line and 495 // *column with the line and column numbers. 
If not found, sets *line to 496 // -1 and *column to 0 (since line = -1 is used to mean "error has no exact 497 // location" in the ErrorCollector interface). Returns true if found, false 498 // otherwise. 499 bool Find(const Message* descriptor, 500 DescriptorPool::ErrorCollector::ErrorLocation location, 501 int* line, int* column) const; 502 503 // Adds a location to the table. 504 void Add(const Message* descriptor, 505 DescriptorPool::ErrorCollector::ErrorLocation location, 506 int line, int column); 507 508 // Clears the contents of the table. 509 void Clear(); 510 511 private: 512 typedef map< 513 pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>, 514 pair<int, int> > LocationMap; 515 LocationMap location_map_; 516 }; 517 518 } // namespace compiler 519 } // namespace protobuf 520 521 } // namespace google 522 #endif // GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 523