# Protocol Buffers - Google's data interchange format
# Copyright 2008 Google Inc. All rights reserved.
# https://developers.google.com/protocol-buffers/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Contains routines for printing protocol messages in text format.

Simple usage example::

  # Create a proto object and serialize it to a text proto string.
  message = my_proto_pb2.MyMessage(foo='bar')
  text_proto = text_format.MessageToString(message)

  # Parse a text proto string.
  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
"""

__author__ = 'kenton@google.com (Kenton Varda)'

# TODO(b/129989314) Import thread contention leads to test failures.
import encodings.raw_unicode_escape  # pylint: disable=unused-import
import encodings.unicode_escape  # pylint: disable=unused-import
import io
import math
import re

from google.protobuf.internal import decoder
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding

# pylint: disable=g-import-not-at-top
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# Range checkers used when parsing integer scalars from text.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches 'inf'/'infinity' with optional sign and optional trailing 'f'
# (the 'f' suffix marks a single-precision float literal in text format).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'


class Error(Exception):
  """Top-level module error for text_format."""


class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    # When a source location is known, prefix the message with
    # "line[:column] : " so the error pinpoints the offending text.
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is not None:
      super(ParseError, self).__init__(message)
    else:
      super(ParseError, self).__init__()
    self._line = line
    self._column = column

  def GetLine(self):
    # Line number of the error, or None if not supplied at construction.
    return self._line

  def GetColumn(self):
    # Column number of the error, or None if not supplied at construction.
    return self._column


class TextWriter(object):
  """Thin write/close/getvalue adapter over an in-memory text buffer."""

  def __init__(self, as_utf8):
    # NOTE: as_utf8 is accepted for API compatibility but is unused here;
    # all output is buffered as text via io.StringIO regardless.
    self._writer = io.StringIO()

  def write(self, val):
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()


def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than only
        ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest
      float that has same value in wire will be printed. Also affect double
      field if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any
      types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(
      out,
      indent,
      as_utf8,
      as_one_line,
      use_short_repeated_primitives,
      pointy_brackets,
      use_index_order,
      float_format,
      double_format,
      use_field_number,
      descriptor_pool,
      message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # Single-line output carries a trailing separator space; drop it.
    return result.rstrip()
  return result


def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    return text
  # Encoding follows the as_utf8 flag: UTF-8 when set, ASCII otherwise.
  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
  return text.encode(codec)


def _IsMapEntry(field):
  # Map fields are synthesized as repeated messages whose entry type carries
  # the map_entry option.
  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
          field.message_type.has_options and
          field.message_type.GetOptions().map_entry)


def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Convert the message to text format and write it to the out stream.

  See MessageToString for the meaning of the formatting arguments.
  """
  printer = _Printer(
      out=out, indent=indent, as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)


def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Print a single field name/value pair."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintField(field, value)


def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Print a single field value (not including name)."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintFieldValue(field, value)


def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if a Descriptor
    wasn't found matching type_name.
  """
  # pylint: disable=g-import-not-at-top
  # Imported lazily to avoid import-time cycles with the pool/symbol modules.
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  message_type = database.GetPrototype(message_descriptor)
  return message_type()


# These values must match WireType enum in google/protobuf/wire_format.h.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3


class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a
    proto will result in an identical value, double_format='.17g' should be
    used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
          In Python 3 actual Unicode characters may appear as is in strings.
          In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
          nesting.
      use_index_order: If True, print fields of a proto message using the
          order defined in source code instead of the field number. By
          default, use the field number order.
      float_format: If set, use this to specify float field formatting
          (per the "Format Specification Mini-Language"); otherwise, shortest
          float that has same value in wire will be printed. Also affect
          double field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
          (per the "Format Specification Mini-Language"); if it is not set but
          float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line):
          unicode|None to custom format selected sub-messages (usually based
          on message type). Use to pretty print parts of the protobuf for
          easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
          the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Doubles fall back to float_format when double_format is not given.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field."""
    if '/' not in message.type_url:
      return False
    # Resolve the packed type; if it is unknown to the pool, fall back to
    # printing the Any as a regular message (caller handles that path).
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url,
                                    colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      return False
408 409 def _TryCustomFormatMessage(self, message): 410 formatted = self.message_formatter(message, self.indent, self.as_one_line) 411 if formatted is None: 412 return False 413 414 out = self.out 415 out.write(' ' * self.indent) 416 out.write(formatted) 417 out.write(' ' if self.as_one_line else '\n') 418 return True 419 420 def PrintMessage(self, message): 421 """Convert protobuf message to text format. 422 423 Args: 424 message: The protocol buffers message. 425 """ 426 if self.message_formatter and self._TryCustomFormatMessage(message): 427 return 428 if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and 429 self._TryPrintAsAnyMessage(message)): 430 return 431 fields = message.ListFields() 432 if self.use_index_order: 433 fields.sort( 434 key=lambda x: x[0].number if x[0].is_extension else x[0].index) 435 for field, value in fields: 436 if _IsMapEntry(field): 437 for key in sorted(value): 438 # This is slow for maps with submessage entries because it copies the 439 # entire tree. Unfortunately this would take significant refactoring 440 # of this file to work around. 441 # 442 # TODO(haberman): refactor and optimize if this becomes an issue. 
443 entry_submsg = value.GetEntryClass()(key=key, value=value[key]) 444 self.PrintField(field, entry_submsg) 445 elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 446 if (self.use_short_repeated_primitives 447 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE 448 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING): 449 self._PrintShortRepeatedPrimitivesValue(field, value) 450 else: 451 for element in value: 452 self.PrintField(field, element) 453 else: 454 self.PrintField(field, value) 455 456 if self.print_unknown_fields: 457 self._PrintUnknownFields(message.UnknownFields()) 458 459 def _PrintUnknownFields(self, unknown_fields): 460 """Print unknown fields.""" 461 out = self.out 462 for field in unknown_fields: 463 out.write(' ' * self.indent) 464 out.write(str(field.field_number)) 465 if field.wire_type == WIRETYPE_START_GROUP: 466 if self.as_one_line: 467 out.write(' { ') 468 else: 469 out.write(' {\n') 470 self.indent += 2 471 472 self._PrintUnknownFields(field.data) 473 474 if self.as_one_line: 475 out.write('} ') 476 else: 477 self.indent -= 2 478 out.write(' ' * self.indent + '}\n') 479 elif field.wire_type == WIRETYPE_LENGTH_DELIMITED: 480 try: 481 # If this field is parseable as a Message, it is probably 482 # an embedded message. 483 # pylint: disable=protected-access 484 (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet( 485 memoryview(field.data), 0, len(field.data)) 486 except Exception: # pylint: disable=broad-except 487 pos = 0 488 489 if pos == len(field.data): 490 if self.as_one_line: 491 out.write(' { ') 492 else: 493 out.write(' {\n') 494 self.indent += 2 495 496 self._PrintUnknownFields(embedded_unknown_message) 497 498 if self.as_one_line: 499 out.write('} ') 500 else: 501 self.indent -= 2 502 out.write(' ' * self.indent + '}\n') 503 else: 504 # A string or bytes field. self.as_utf8 may not work. 
505 out.write(': \"') 506 out.write(text_encoding.CEscape(field.data, False)) 507 out.write('\" ' if self.as_one_line else '\"\n') 508 else: 509 # varint, fixed32, fixed64 510 out.write(': ') 511 out.write(str(field.data)) 512 out.write(' ' if self.as_one_line else '\n') 513 514 def _PrintFieldName(self, field): 515 """Print field name.""" 516 out = self.out 517 out.write(' ' * self.indent) 518 if self.use_field_number: 519 out.write(str(field.number)) 520 else: 521 if field.is_extension: 522 out.write('[') 523 if (field.containing_type.GetOptions().message_set_wire_format and 524 field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and 525 field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL): 526 out.write(field.message_type.full_name) 527 else: 528 out.write(field.full_name) 529 out.write(']') 530 elif field.type == descriptor.FieldDescriptor.TYPE_GROUP: 531 # For groups, use the capitalized name. 532 out.write(field.message_type.name) 533 else: 534 out.write(field.name) 535 536 if (self.force_colon or 537 field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE): 538 # The colon is optional in this case, but our cross-language golden files 539 # don't include it. Here, the colon is only included if force_colon is 540 # set to True 541 out.write(':') 542 543 def PrintField(self, field, value): 544 """Print a single field name/value pair.""" 545 self._PrintFieldName(field) 546 self.out.write(' ') 547 self.PrintFieldValue(field, value) 548 self.out.write(' ' if self.as_one_line else '\n') 549 550 def _PrintShortRepeatedPrimitivesValue(self, field, value): 551 """"Prints short repeated primitives value.""" 552 # Note: this is called only when value has at least one element. 
553 self._PrintFieldName(field) 554 self.out.write(' [') 555 for i in range(len(value) - 1): 556 self.PrintFieldValue(field, value[i]) 557 self.out.write(', ') 558 self.PrintFieldValue(field, value[-1]) 559 self.out.write(']') 560 self.out.write(' ' if self.as_one_line else '\n') 561 562 def _PrintMessageFieldValue(self, value): 563 if self.pointy_brackets: 564 openb = '<' 565 closeb = '>' 566 else: 567 openb = '{' 568 closeb = '}' 569 570 if self.as_one_line: 571 self.out.write('%s ' % openb) 572 self.PrintMessage(value) 573 self.out.write(closeb) 574 else: 575 self.out.write('%s\n' % openb) 576 self.indent += 2 577 self.PrintMessage(value) 578 self.indent -= 2 579 self.out.write(' ' * self.indent + closeb) 580 581 def PrintFieldValue(self, field, value): 582 """Print a single field value (not including name). 583 584 For repeated fields, the value should be a single element. 585 586 Args: 587 field: The descriptor of the field to be printed. 588 value: The value of the field. 589 """ 590 out = self.out 591 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 592 self._PrintMessageFieldValue(value) 593 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM: 594 enum_value = field.enum_type.values_by_number.get(value, None) 595 if enum_value is not None: 596 out.write(enum_value.name) 597 else: 598 out.write(str(value)) 599 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING: 600 out.write('\"') 601 if isinstance(value, str) and not self.as_utf8: 602 out_value = value.encode('utf-8') 603 else: 604 out_value = value 605 if field.type == descriptor.FieldDescriptor.TYPE_BYTES: 606 # We always need to escape all binary data in TYPE_BYTES fields. 
607 out_as_utf8 = False 608 else: 609 out_as_utf8 = self.as_utf8 610 out.write(text_encoding.CEscape(out_value, out_as_utf8)) 611 out.write('\"') 612 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL: 613 if value: 614 out.write('true') 615 else: 616 out.write('false') 617 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT: 618 if self.float_format is not None: 619 out.write('{1:{0}}'.format(self.float_format, value)) 620 else: 621 if math.isnan(value): 622 out.write(str(value)) 623 else: 624 out.write(str(type_checkers.ToShortestFloat(value))) 625 elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and 626 self.double_format is not None): 627 out.write('{1:{0}}'.format(self.double_format, value)) 628 else: 629 out.write(str(value)) 630 631 632def Parse(text, 633 message, 634 allow_unknown_extension=False, 635 allow_field_number=False, 636 descriptor_pool=None, 637 allow_unknown_field=False): 638 """Parses a text representation of a protocol message into a message. 639 640 NOTE: for historical reasons this function does not clear the input 641 message. This is different from what the binary msg.ParseFrom(...) does. 642 If text contains a field already set in message, the value is appended if the 643 field is repeated. Otherwise, an error is raised. 644 645 Example:: 646 647 a = MyProto() 648 a.repeated_field.append('test') 649 b = MyProto() 650 651 # Repeated fields are combined 652 text_format.Parse(repr(a), b) 653 text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"] 654 655 # Non-repeated fields cannot be overwritten 656 a.singular_field = 1 657 b.singular_field = 2 658 text_format.Parse(repr(a), b) # ParseError 659 660 # Binary version: 661 b.ParseFromString(a.SerializeToString()) # repeated_field is now "test" 662 663 Caller is responsible for clearing the message as needed. 664 665 Args: 666 text (str): Message text representation. 667 message (Message): A protocol buffer message to merge into. 
668 allow_unknown_extension: if True, skip over missing extensions and keep 669 parsing 670 allow_field_number: if True, both field number and field name are allowed. 671 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types. 672 allow_unknown_field: if True, skip over unknown field and keep 673 parsing. Avoid to use this option if possible. It may hide some 674 errors (e.g. spelling error on field name) 675 676 Returns: 677 Message: The same message passed as argument. 678 679 Raises: 680 ParseError: On text parsing problems. 681 """ 682 return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'), 683 message, 684 allow_unknown_extension, 685 allow_field_number, 686 descriptor_pool=descriptor_pool, 687 allow_unknown_field=allow_unknown_field) 688 689 690def Merge(text, 691 message, 692 allow_unknown_extension=False, 693 allow_field_number=False, 694 descriptor_pool=None, 695 allow_unknown_field=False): 696 """Parses a text representation of a protocol message into a message. 697 698 Like Parse(), but allows repeated values for a non-repeated field, and uses 699 the last one. This means any non-repeated, top-level fields specified in text 700 replace those in the message. 701 702 Args: 703 text (str): Message text representation. 704 message (Message): A protocol buffer message to merge into. 705 allow_unknown_extension: if True, skip over missing extensions and keep 706 parsing 707 allow_field_number: if True, both field number and field name are allowed. 708 descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types. 709 allow_unknown_field: if True, skip over unknown field and keep 710 parsing. Avoid to use this option if possible. It may hide some 711 errors (e.g. spelling error on field name) 712 713 Returns: 714 Message: The same message passed as argument. 715 716 Raises: 717 ParseError: On text parsing problems. 
718 """ 719 return MergeLines( 720 text.split(b'\n' if isinstance(text, bytes) else u'\n'), 721 message, 722 allow_unknown_extension, 723 allow_field_number, 724 descriptor_pool=descriptor_pool, 725 allow_unknown_field=allow_unknown_field) 726 727 728def ParseLines(lines, 729 message, 730 allow_unknown_extension=False, 731 allow_field_number=False, 732 descriptor_pool=None, 733 allow_unknown_field=False): 734 """Parses a text representation of a protocol message into a message. 735 736 See Parse() for caveats. 737 738 Args: 739 lines: An iterable of lines of a message's text representation. 740 message: A protocol buffer message to merge into. 741 allow_unknown_extension: if True, skip over missing extensions and keep 742 parsing 743 allow_field_number: if True, both field number and field name are allowed. 744 descriptor_pool: A DescriptorPool used to resolve Any types. 745 allow_unknown_field: if True, skip over unknown field and keep 746 parsing. Avoid to use this option if possible. It may hide some 747 errors (e.g. spelling error on field name) 748 749 Returns: 750 The same message passed as argument. 751 752 Raises: 753 ParseError: On text parsing problems. 754 """ 755 parser = _Parser(allow_unknown_extension, 756 allow_field_number, 757 descriptor_pool=descriptor_pool, 758 allow_unknown_field=allow_unknown_field) 759 return parser.ParseLines(lines, message) 760 761 762def MergeLines(lines, 763 message, 764 allow_unknown_extension=False, 765 allow_field_number=False, 766 descriptor_pool=None, 767 allow_unknown_field=False): 768 """Parses a text representation of a protocol message into a message. 769 770 See Merge() for more details. 771 772 Args: 773 lines: An iterable of lines of a message's text representation. 774 message: A protocol buffer message to merge into. 775 allow_unknown_extension: if True, skip over missing extensions and keep 776 parsing 777 allow_field_number: if True, both field number and field name are allowed. 
778 descriptor_pool: A DescriptorPool used to resolve Any types. 779 allow_unknown_field: if True, skip over unknown field and keep 780 parsing. Avoid to use this option if possible. It may hide some 781 errors (e.g. spelling error on field name) 782 783 Returns: 784 The same message passed as argument. 785 786 Raises: 787 ParseError: On text parsing problems. 788 """ 789 parser = _Parser(allow_unknown_extension, 790 allow_field_number, 791 descriptor_pool=descriptor_pool, 792 allow_unknown_field=allow_unknown_field) 793 return parser.MergeLines(lines, message) 794 795 796class _Parser(object): 797 """Text format parser for protocol message.""" 798 799 def __init__(self, 800 allow_unknown_extension=False, 801 allow_field_number=False, 802 descriptor_pool=None, 803 allow_unknown_field=False): 804 self.allow_unknown_extension = allow_unknown_extension 805 self.allow_field_number = allow_field_number 806 self.descriptor_pool = descriptor_pool 807 self.allow_unknown_field = allow_unknown_field 808 809 def ParseLines(self, lines, message): 810 """Parses a text representation of a protocol message into a message.""" 811 self._allow_multiple_scalars = False 812 self._ParseOrMerge(lines, message) 813 return message 814 815 def MergeLines(self, lines, message): 816 """Merges a text representation of a protocol message into a message.""" 817 self._allow_multiple_scalars = True 818 self._ParseOrMerge(lines, message) 819 return message 820 821 def _ParseOrMerge(self, lines, message): 822 """Converts a text representation of a protocol message into a message. 823 824 Args: 825 lines: Lines of a message's text representation. 826 message: A protocol buffer message to merge into. 827 828 Raises: 829 ParseError: On text parsing problems. 830 """ 831 # Tokenize expects native str lines. 
    # Normalize any bytes lines to str so the tokenizer only ever sees text.
    str_lines = (
        line if isinstance(line, str) else line.decode('utf-8')
        for line in lines)
    tokenizer = Tokenizer(str_lines)
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Handles the expanded google.protobuf.Any syntax
    ("[type.googleapis.com/pkg.Type] { ... }"), extension fields
    ("[ext.full.name]"), regular fields, and groups.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any: parse the inner message by its type URL, then Pack it
    # back into this Any message.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      # The ':' between the type URL and the message body is optional.
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                          self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      # No determinism guarantee is made for the bytes packed while parsing.
      deterministic = False

      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix,
                   deterministic=deterministic)
      return

    if tokenizer.TryConsume('['):
      # Extension field: "[qualified.extension.name]".
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access


      if not field:
        if self.allow_unknown_extension:
          # field stays None; the unknown-field skip path below handles it.
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        # Fields may be referenced by number when allow_field_number is set.
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # ':' is optional before a message value but required before a scalar.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')


  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
    # The prefix is exactly three dot-separated identifiers before the '/'.
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    return '.'.join(prefix), '.'.join(name)

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # Message bodies may be delimited by either <...> or {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a standalone entry message and
        # inserted into the map container afterwards.
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Mark the singular field present even if its body turns out empty.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.CopyFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

  @staticmethod
  def _IsProto3Syntax(message):
    """Returns True iff the message's descriptor reports proto3 syntax."""
    message_descriptor = message.DESCRIPTOR
    return (hasattr(message_descriptor, 'syntax') and
            message_descriptor.syntax == 'proto3')

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
    """
    _ = self.allow_unknown_extension  # Deliberately unused.
    value = None

    # Dispatch on the wire type to the matching consumer.
    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                      descriptor.FieldDescriptor.TYPE_SINT32,
                      descriptor.FieldDescriptor.TYPE_SFIXED32):
      value = _ConsumeInt32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                        descriptor.FieldDescriptor.TYPE_SINT64,
                        descriptor.FieldDescriptor.TYPE_SFIXED64):
      value = _ConsumeInt64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                        descriptor.FieldDescriptor.TYPE_FIXED32):
      value = _ConsumeUint32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                        descriptor.FieldDescriptor.TYPE_FIXED64):
      value = _ConsumeUint64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                        descriptor.FieldDescriptor.TYPE_DOUBLE):
      value = tokenizer.ConsumeFloat()
    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
      value = tokenizer.ConsumeBool()
    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
      value = tokenizer.ConsumeString()
    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      value = tokenizer.ConsumeByteString()
    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
      value = tokenizer.ConsumeEnum(field)
    else:
      raise RuntimeError('Unknown field type %d' % field.type)

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        message.Extensions[field].append(value)
      else:
        getattr(message, field.name).append(value)
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            not self._IsProto3Syntax(message) and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        else:
          message.Extensions[field] = value
      else:
        duplicate_error = False
        if not self._allow_multiple_scalars:
          if self._IsProto3Syntax(message):
            # Proto3 doesn't represent presence, so we try a best effort to
            # detect multiple scalars by comparing to the default value.
            duplicate_error = bool(getattr(message, field.name))
          else:
            duplicate_error = message.HasField(field.name)

        if duplicate_error:
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        else:
          setattr(message, field.name, value)


def _SkipFieldContents(tokenizer):
  """Skips over contents (value or message) of a field.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Try to guess the type of this field.
  # If this field is not a message, there should be a ":" between the
  # field name and the field value and also the field value should not
  # start with "{" or "<" which indicates the beginning of a message body.
  # If there is no ":" or there is a "{" or "<" after ":", this field has
  # to be a message or the input is ill-formed.
  if tokenizer.TryConsume(':') and not tokenizer.LookingAt(
      '{') and not tokenizer.LookingAt('<'):
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)


def _SkipField(tokenizer):
  """Skips over a complete field (name and value/message).

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  if tokenizer.TryConsume('['):
    # Consume extension name.
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')
  else:
    tokenizer.ConsumeIdentifierOrNumber()

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')


def _SkipFieldMessage(tokenizer):
  """Skips over a field message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """

  if tokenizer.TryConsume('<'):
    delimiter = '>'
  else:
    tokenizer.Consume('{')
    delimiter = '}'

  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
    _SkipField(tokenizer)

  # Consume exactly the delimiter matching the opener; a mismatched closer
  # is reported here as a parse error.
  tokenizer.Consume(delimiter)


def _SkipFieldValue(tokenizer):
  """Skips over a field value.

  Args:
    tokenizer: A tokenizer to parse the field name and values.

  Raises:
    ParseError: In case an invalid field value is found.
  """
  # String/bytes tokens can come in multiple adjacent string literals.
  # If we can consume one, consume as many as we can.
  if tokenizer.TryConsumeByteString():
    while tokenizer.TryConsumeByteString():
      pass
    return

  if (not tokenizer.TryConsumeIdentifier() and
      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
      not tokenizer.TryConsumeFloat()):
    raise ParseError('Invalid field value: ' + tokenizer.token)


class Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  _WHITESPACE = re.compile(r'\s+')
  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
  _TOKEN = re.compile('|'.join([
      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
  ] + [  # quoted str for each quote mark
      # Avoid backtracking! https://stackoverflow.com/a/844267
      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
      for mark in _QUOTES
  ]))

  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')

  def __init__(self, lines, skip_comments=True):
    """Initializes the tokenizer and reads the first token.

    Args:
      lines: An iterable of text lines to tokenize.
      skip_comments: If True, '#' comments are treated as whitespace.
    """
    self._position = 0
    self._line = -1
    self._column = 0
    self._token_start = None
    self.token = ''
    self._lines = iter(lines)
    self._current_line = ''
    self._previous_line = 0
    self._previous_column = 0
    self._more_lines = True
    self._skip_comments = skip_comments
    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
                                or self._WHITESPACE)
    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals the given text."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    # Advances to the next input line whenever the current one is exhausted.
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    # Skips whitespace (and comments, when skip_comments is set) across lines.
    while True:
      self._PopLine()
      match = self._whitespace_pattern.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self.ParseError('Expected "%s".' % token)

  def ConsumeComment(self):
    """Consumes a comment token and returns its text.

    Raises:
      ParseError: If the current token is not a comment.
    """
    result = self.token
    if not self._COMMENT.match(result):
      raise self.ParseError('Expected comment.')
    self.NextToken()
    return result

  def ConsumeCommentOrTrailingComment(self):
    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""

    # Tokenizer initializes _previous_line and _previous_column to 0. As the
    # tokenizer starts, it looks like there is a previous token on the line.
    just_started = self._line == 0 and self._column == 0

    before_parsing = self._previous_line
    comment = self.ConsumeComment()

    # A trailing comment is a comment on the same line as the previous token.
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    try:
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self):
    """Consumes an integer number.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return str(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a bytes object).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, as in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python. This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self.ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self.ParseError('String missing ending quote: %r' % (text,))

    try:
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes an enum value (by name or number) for the given field."""
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError(message, self._previous_line + 1,
                      self._previous_column + 1)

  def ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('\'' + self._current_line + '\': ' + message,
                      self._line + 1, self._column + 1)

  def _StringParseError(self, e):
    return self.ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if not match and not self._skip_comments:
      match = self._COMMENT.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # Fall back to a single character token (punctuation such as '{', ':').
      self.token = self._current_line[self._column]

# Aliased so it can still be accessed by current visibility violators.
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name


def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)


def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)


def _TryConsumeInt64(tokenizer):
  """Returns True iff a signed 64bit integer was consumed."""
  try:
    _ConsumeInt64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)


def _TryConsumeUint64(tokenizer):
  """Returns True iff an unsigned 64bit integer was consumed."""
  try:
    _ConsumeUint64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)


def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return result


def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  result = _ParseAbstractInteger(text)

  # Check if the integer is sane. Exceptions handled by callers.
  # Index layout: [uint32, int32, uint64, int64].
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result


def _ParseAbstractInteger(text):
  """Parses an integer without checking size/signedness.

  Args:
    text: The text to parse.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown iff the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  orig_text = text
  # Accept C-style octal literals like "0755" by rewriting to Python's "0o"
  # prefix before delegating to int(text, 0).
  c_octal_match = re.match(r'(-?)0(\d+)$', text)
  if c_octal_match:
    # Python 3 no longer supports 0755 octal syntax without the 'o', so
    # we always use the '0o' prefix for multi-digit numbers starting with 0.
    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
  try:
    return int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % orig_text)


def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # Assume C-style float suffix, e.g. '1.0f'.
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)


def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')


def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Numeric value.
    if hasattr(field.file, 'syntax'):
      # Attribute is checked for compatibility.
      if field.file.syntax == 'proto3':
        # Proto3 accepts numeric unknown enum values.
        return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number