1# Protocol Buffers - Google's data interchange format 2# Copyright 2008 Google Inc. All rights reserved. 3# https://developers.google.com/protocol-buffers/ 4# 5# Redistribution and use in source and binary forms, with or without 6# modification, are permitted provided that the following conditions are 7# met: 8# 9# * Redistributions of source code must retain the above copyright 10# notice, this list of conditions and the following disclaimer. 11# * Redistributions in binary form must reproduce the above 12# copyright notice, this list of conditions and the following disclaimer 13# in the documentation and/or other materials provided with the 14# distribution. 15# * Neither the name of Google Inc. nor the names of its 16# contributors may be used to endorse or promote products derived from 17# this software without specific prior written permission. 18# 19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31"""Contains routines for printing protocol messages in text format. 32 33Simple usage example:: 34 35 # Create a proto object and serialize it to a text proto string. 36 message = my_proto_pb2.MyMessage(foo='bar') 37 text_proto = text_format.MessageToString(message) 38 39 # Parse a text proto string. 
  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
"""

__author__ = 'kenton@google.com (Kenton Varda)'

# TODO(b/129989314) Import thread contention leads to test failures.
import encodings.raw_unicode_escape  # pylint: disable=unused-import
import encodings.unicode_escape  # pylint: disable=unused-import
import io
import re

import six

if six.PY3:
  long = int  # pylint: disable=redefined-builtin,invalid-name

# pylint: disable=g-import-not-at-top
from google.protobuf.internal import decoder
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding

__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# Checkers used to validate integer scalars parsed from text.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Textual spellings of special float values, e.g. 'inf', '-Infinity', 'nanf'.
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'


class Error(Exception):
  """Top-level module error for text_format."""


class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    # When a location is known, prefix the message with 'line[:column] : '.
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is not None:
      super(ParseError, self).__init__(message)
    else:
      super(ParseError, self).__init__()
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the line where the error occurred, or None if unknown."""
    return self._line

  def GetColumn(self):
    """Returns the column where the error occurred, or None if unknown."""
    return self._column


class TextWriter(object):
  """In-memory text buffer yielding bytes on Python 2 and str on Python 3."""

  def __init__(self, as_utf8):
    # NOTE(review): as_utf8 is currently unused here; the buffer type depends
    # only on the Python major version.
    if six.PY2:
      self._writer = io.BytesIO()
    else:
      self._writer = io.StringIO()

  def write(self, val):
    # On Python 2, text is encoded to UTF-8 before hitting the bytes buffer.
    if six.PY2:
      if isinstance(val, six.text_type):
        val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()


def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
      In Python 3 actual Unicode characters may appear as is in strings.
      In Python 2 the return value will be valid UTF-8 rather than only ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest float
      that has same value in wire will be printed. Also affect double field
      if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(
      out,
      indent,
      as_utf8,
      as_one_line,
      use_short_repeated_primitives,
      pointy_brackets,
      use_index_order,
      float_format,
      double_format,
      use_field_number,
      descriptor_pool,
      message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # Single-line output ends with a trailing separator space; drop it.
    return result.rstrip()
  return result


def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    # On Python 2 MessageToString already returns (UTF-8) bytes.
    return text
  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
  return text.encode(codec)


def _IsMapEntry(field):
  """Returns True if the field is a synthesized map-entry message field."""
  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
          field.message_type.has_options and
          field.message_type.GetOptions().map_entry)


def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Convert the message to text format and write it to out.

  See MessageToString() for the meaning of the keyword arguments.
  """
  printer = _Printer(
      out=out, indent=indent, as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)


def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Print a single field name/value pair."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintField(field, value)


def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Print a single field value (not including name)."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintFieldValue(field, value)


def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if the a Descriptor
    wasn't found matching type_name.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  message_type = database.GetPrototype(message_descriptor)
  return message_type()


# These values must match WireType enum in google/protobuf/wire_format.h.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3


class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, shortest
        float that has same value in wire will be printed. Also affect double
        field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Doubles fall back to the float format when no double format is given.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field."""
    if '/' not in message.type_url:
      # A type_url without a '/' cannot name a packed type; caller falls back
      # to printing the Any's raw fields.
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      return False

  def _TryCustomFormatMessage(self, message):
    """Prints message via self.message_formatter; True if it handled it."""
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(message.UnknownFields())

  def _PrintUnknownFields(self, unknown_fields):
    """Print unknown fields."""
    out = self.out
    for field in unknown_fields:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:    # pylint: disable=broad-except
          pos = 0

        if pos == len(field.data):
          # The whole payload decoded as a field set: print as a nested group.
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in six.moves.range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    if self.force_colon:
      self.out.write(':')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a nested message value wrapped in braces or pointy brackets."""
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        # Unknown enum number: print it numerically.
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        # No explicit format: print the shortest string round-tripping to the
        # same wire value.
        out.write(str(type_checkers.ToShortestFloat(value)))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))


def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message. This is different from what the binary msg.ParseFrom(...) does.
  If text contains a field already set in message, the value is appended if the
  field is repeated. Otherwise, an error is raised.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on whichever newline type matches the input's str/bytes type.
  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
                    message,
                    allow_unknown_extension,
                    allow_field_number,
                    descriptor_pool=descriptor_pool,
                    allow_unknown_field=allow_unknown_field)


def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one. This means any non-repeated, top-level fields specified in text
  replace those in the message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return MergeLines(
      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)


def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Parse() for caveats.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.ParseLines(lines, message)


def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Merge() for more details.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.MergeLines(lines, message)


class _Parser(object):
  """Text format parser for protocol message."""

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

  def ParseLines(self, lines, message):
    """Parses a text representation of a protocol message into a message."""
    # Parse semantics: a singular field may be given only once.
    self._allow_multiple_scalars = False
    self._ParseOrMerge(lines, message)
    return message

  def MergeLines(self, lines, message):
    """Merges a text representation of a protocol message into a message."""
    # Merge semantics: the last occurrence of a singular field wins.
    self._allow_multiple_scalars = True
    self._ParseOrMerge(lines, message)
    return message

  def _ParseOrMerge(self, lines, message):
    """Converts a text representation of a protocol message into a message.

    Args:
      lines: Lines of a message's text representation.
      message: A protocol buffer message to merge into.

    Raises:
      ParseError: On text parsing problems.
    """
    # Tokenize expects native str lines.
    if six.PY2:
      str_lines = (line if isinstance(line, str) else line.encode('utf-8')
                   for line in lines)
    else:
      str_lines = (line if isinstance(line, str) else line.decode('utf-8')
                   for line in lines)
    tokenizer = Tokenizer(str_lines)
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: "[type.googleapis.com/full.TypeName] { ... }".
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix)
      return

    # Extension field: "[full.extension.name]".
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        # The colon is optional before a message value.
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
988 prefix = [tokenizer.ConsumeIdentifier()] 989 tokenizer.Consume('.') 990 prefix.append(tokenizer.ConsumeIdentifier()) 991 tokenizer.Consume('.') 992 prefix.append(tokenizer.ConsumeIdentifier()) 993 tokenizer.Consume('/') 994 # Consume the fully-qualified type name. 995 name = [tokenizer.ConsumeIdentifier()] 996 while tokenizer.TryConsume('.'): 997 name.append(tokenizer.ConsumeIdentifier()) 998 return '.'.join(prefix), '.'.join(name) 999 1000 def _MergeMessageField(self, tokenizer, message, field): 1001 """Merges a single scalar field into a message. 1002 1003 Args: 1004 tokenizer: A tokenizer to parse the field value. 1005 message: The message of which field is a member. 1006 field: The descriptor of the field to be merged. 1007 1008 Raises: 1009 ParseError: In case of text parsing problems. 1010 """ 1011 is_map_entry = _IsMapEntry(field) 1012 1013 if tokenizer.TryConsume('<'): 1014 end_token = '>' 1015 else: 1016 tokenizer.Consume('{') 1017 end_token = '}' 1018 1019 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 1020 if field.is_extension: 1021 sub_message = message.Extensions[field].add() 1022 elif is_map_entry: 1023 sub_message = getattr(message, field.name).GetEntryClass()() 1024 else: 1025 sub_message = getattr(message, field.name).add() 1026 else: 1027 if field.is_extension: 1028 if (not self._allow_multiple_scalars and 1029 message.HasExtension(field)): 1030 raise tokenizer.ParseErrorPreviousToken( 1031 'Message type "%s" should not have multiple "%s" extensions.' % 1032 (message.DESCRIPTOR.full_name, field.full_name)) 1033 sub_message = message.Extensions[field] 1034 else: 1035 # Also apply _allow_multiple_scalars to message field. 1036 # TODO(jieluo): Change to _allow_singular_overwrites. 1037 if (not self._allow_multiple_scalars and 1038 message.HasField(field.name)): 1039 raise tokenizer.ParseErrorPreviousToken( 1040 'Message type "%s" should not have multiple "%s" fields.' 
% 1041 (message.DESCRIPTOR.full_name, field.name)) 1042 sub_message = getattr(message, field.name) 1043 sub_message.SetInParent() 1044 1045 while not tokenizer.TryConsume(end_token): 1046 if tokenizer.AtEnd(): 1047 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,)) 1048 self._MergeField(tokenizer, sub_message) 1049 1050 if is_map_entry: 1051 value_cpptype = field.message_type.fields_by_name['value'].cpp_type 1052 if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 1053 value = getattr(message, field.name)[sub_message.key] 1054 value.MergeFrom(sub_message.value) 1055 else: 1056 getattr(message, field.name)[sub_message.key] = sub_message.value 1057 1058 @staticmethod 1059 def _IsProto3Syntax(message): 1060 message_descriptor = message.DESCRIPTOR 1061 return (hasattr(message_descriptor, 'syntax') and 1062 message_descriptor.syntax == 'proto3') 1063 1064 def _MergeScalarField(self, tokenizer, message, field): 1065 """Merges a single scalar field into a message. 1066 1067 Args: 1068 tokenizer: A tokenizer to parse the field value. 1069 message: A protocol message to record the data. 1070 field: The descriptor of the field to be merged. 1071 1072 Raises: 1073 ParseError: In case of text parsing problems. 1074 RuntimeError: On runtime errors. 
1075 """ 1076 _ = self.allow_unknown_extension 1077 value = None 1078 1079 if field.type in (descriptor.FieldDescriptor.TYPE_INT32, 1080 descriptor.FieldDescriptor.TYPE_SINT32, 1081 descriptor.FieldDescriptor.TYPE_SFIXED32): 1082 value = _ConsumeInt32(tokenizer) 1083 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64, 1084 descriptor.FieldDescriptor.TYPE_SINT64, 1085 descriptor.FieldDescriptor.TYPE_SFIXED64): 1086 value = _ConsumeInt64(tokenizer) 1087 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32, 1088 descriptor.FieldDescriptor.TYPE_FIXED32): 1089 value = _ConsumeUint32(tokenizer) 1090 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64, 1091 descriptor.FieldDescriptor.TYPE_FIXED64): 1092 value = _ConsumeUint64(tokenizer) 1093 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT, 1094 descriptor.FieldDescriptor.TYPE_DOUBLE): 1095 value = tokenizer.ConsumeFloat() 1096 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL: 1097 value = tokenizer.ConsumeBool() 1098 elif field.type == descriptor.FieldDescriptor.TYPE_STRING: 1099 value = tokenizer.ConsumeString() 1100 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES: 1101 value = tokenizer.ConsumeByteString() 1102 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM: 1103 value = tokenizer.ConsumeEnum(field) 1104 else: 1105 raise RuntimeError('Unknown field type %d' % field.type) 1106 1107 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 1108 if field.is_extension: 1109 message.Extensions[field].append(value) 1110 else: 1111 getattr(message, field.name).append(value) 1112 else: 1113 if field.is_extension: 1114 if (not self._allow_multiple_scalars and 1115 not self._IsProto3Syntax(message) and 1116 message.HasExtension(field)): 1117 raise tokenizer.ParseErrorPreviousToken( 1118 'Message type "%s" should not have multiple "%s" extensions.' 
% 1119 (message.DESCRIPTOR.full_name, field.full_name)) 1120 else: 1121 message.Extensions[field] = value 1122 else: 1123 duplicate_error = False 1124 if not self._allow_multiple_scalars: 1125 if self._IsProto3Syntax(message): 1126 # Proto3 doesn't represent presence so we try best effort to check 1127 # multiple scalars by compare to default values. 1128 duplicate_error = bool(getattr(message, field.name)) 1129 else: 1130 duplicate_error = message.HasField(field.name) 1131 1132 if duplicate_error: 1133 raise tokenizer.ParseErrorPreviousToken( 1134 'Message type "%s" should not have multiple "%s" fields.' % 1135 (message.DESCRIPTOR.full_name, field.name)) 1136 else: 1137 setattr(message, field.name, value) 1138 1139 1140def _SkipFieldContents(tokenizer): 1141 """Skips over contents (value or message) of a field. 1142 1143 Args: 1144 tokenizer: A tokenizer to parse the field name and values. 1145 """ 1146 # Try to guess the type of this field. 1147 # If this field is not a message, there should be a ":" between the 1148 # field name and the field value and also the field value should not 1149 # start with "{" or "<" which indicates the beginning of a message body. 1150 # If there is no ":" or there is a "{" or "<" after ":", this field has 1151 # to be a message or the input is ill-formed. 1152 if tokenizer.TryConsume(':') and not tokenizer.LookingAt( 1153 '{') and not tokenizer.LookingAt('<'): 1154 _SkipFieldValue(tokenizer) 1155 else: 1156 _SkipFieldMessage(tokenizer) 1157 1158 1159def _SkipField(tokenizer): 1160 """Skips over a complete field (name and value/message). 1161 1162 Args: 1163 tokenizer: A tokenizer to parse the field name and values. 1164 """ 1165 if tokenizer.TryConsume('['): 1166 # Consume extension name. 
1167 tokenizer.ConsumeIdentifier() 1168 while tokenizer.TryConsume('.'): 1169 tokenizer.ConsumeIdentifier() 1170 tokenizer.Consume(']') 1171 else: 1172 tokenizer.ConsumeIdentifierOrNumber() 1173 1174 _SkipFieldContents(tokenizer) 1175 1176 # For historical reasons, fields may optionally be separated by commas or 1177 # semicolons. 1178 if not tokenizer.TryConsume(','): 1179 tokenizer.TryConsume(';') 1180 1181 1182def _SkipFieldMessage(tokenizer): 1183 """Skips over a field message. 1184 1185 Args: 1186 tokenizer: A tokenizer to parse the field name and values. 1187 """ 1188 1189 if tokenizer.TryConsume('<'): 1190 delimiter = '>' 1191 else: 1192 tokenizer.Consume('{') 1193 delimiter = '}' 1194 1195 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'): 1196 _SkipField(tokenizer) 1197 1198 tokenizer.Consume(delimiter) 1199 1200 1201def _SkipFieldValue(tokenizer): 1202 """Skips over a field value. 1203 1204 Args: 1205 tokenizer: A tokenizer to parse the field name and values. 1206 1207 Raises: 1208 ParseError: In case an invalid field value is found. 1209 """ 1210 # String/bytes tokens can come in multiple adjacent string literals. 1211 # If we can consume one, consume as many as we can. 1212 if tokenizer.TryConsumeByteString(): 1213 while tokenizer.TryConsumeByteString(): 1214 pass 1215 return 1216 1217 if (not tokenizer.TryConsumeIdentifier() and 1218 not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and 1219 not tokenizer.TryConsumeFloat()): 1220 raise ParseError('Invalid field value: ' + tokenizer.token) 1221 1222 1223class Tokenizer(object): 1224 """Protocol buffer text representation tokenizer. 1225 1226 This class handles the lower level string parsing by splitting it into 1227 meaningful tokens. 1228 1229 It was directly ported from the Java protocol buffer API. 
1230 """ 1231 1232 _WHITESPACE = re.compile(r'\s+') 1233 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE) 1234 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE) 1235 _TOKEN = re.compile('|'.join([ 1236 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier 1237 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number 1238 ] + [ # quoted str for each quote mark 1239 # Avoid backtracking! https://stackoverflow.com/a/844267 1240 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark) 1241 for mark in _QUOTES 1242 ])) 1243 1244 _IDENTIFIER = re.compile(r'[^\d\W]\w*') 1245 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+') 1246 1247 def __init__(self, lines, skip_comments=True): 1248 self._position = 0 1249 self._line = -1 1250 self._column = 0 1251 self._token_start = None 1252 self.token = '' 1253 self._lines = iter(lines) 1254 self._current_line = '' 1255 self._previous_line = 0 1256 self._previous_column = 0 1257 self._more_lines = True 1258 self._skip_comments = skip_comments 1259 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT 1260 or self._WHITESPACE) 1261 self._SkipWhitespace() 1262 self.NextToken() 1263 1264 def LookingAt(self, token): 1265 return self.token == token 1266 1267 def AtEnd(self): 1268 """Checks the end of the text was reached. 1269 1270 Returns: 1271 True iff the end was reached. 
1272 """ 1273 return not self.token 1274 1275 def _PopLine(self): 1276 while len(self._current_line) <= self._column: 1277 try: 1278 self._current_line = next(self._lines) 1279 except StopIteration: 1280 self._current_line = '' 1281 self._more_lines = False 1282 return 1283 else: 1284 self._line += 1 1285 self._column = 0 1286 1287 def _SkipWhitespace(self): 1288 while True: 1289 self._PopLine() 1290 match = self._whitespace_pattern.match(self._current_line, self._column) 1291 if not match: 1292 break 1293 length = len(match.group(0)) 1294 self._column += length 1295 1296 def TryConsume(self, token): 1297 """Tries to consume a given piece of text. 1298 1299 Args: 1300 token: Text to consume. 1301 1302 Returns: 1303 True iff the text was consumed. 1304 """ 1305 if self.token == token: 1306 self.NextToken() 1307 return True 1308 return False 1309 1310 def Consume(self, token): 1311 """Consumes a piece of text. 1312 1313 Args: 1314 token: Text to consume. 1315 1316 Raises: 1317 ParseError: If the text couldn't be consumed. 1318 """ 1319 if not self.TryConsume(token): 1320 raise self.ParseError('Expected "%s".' % token) 1321 1322 def ConsumeComment(self): 1323 result = self.token 1324 if not self._COMMENT.match(result): 1325 raise self.ParseError('Expected comment.') 1326 self.NextToken() 1327 return result 1328 1329 def ConsumeCommentOrTrailingComment(self): 1330 """Consumes a comment, returns a 2-tuple (trailing bool, comment str).""" 1331 1332 # Tokenizer initializes _previous_line and _previous_column to 0. As the 1333 # tokenizer starts, it looks like there is a previous token on the line. 1334 just_started = self._line == 0 and self._column == 0 1335 1336 before_parsing = self._previous_line 1337 comment = self.ConsumeComment() 1338 1339 # A trailing comment is a comment on the same line than the previous token. 
1340 trailing = (self._previous_line == before_parsing 1341 and not just_started) 1342 1343 return trailing, comment 1344 1345 def TryConsumeIdentifier(self): 1346 try: 1347 self.ConsumeIdentifier() 1348 return True 1349 except ParseError: 1350 return False 1351 1352 def ConsumeIdentifier(self): 1353 """Consumes protocol message field identifier. 1354 1355 Returns: 1356 Identifier string. 1357 1358 Raises: 1359 ParseError: If an identifier couldn't be consumed. 1360 """ 1361 result = self.token 1362 if not self._IDENTIFIER.match(result): 1363 raise self.ParseError('Expected identifier.') 1364 self.NextToken() 1365 return result 1366 1367 def TryConsumeIdentifierOrNumber(self): 1368 try: 1369 self.ConsumeIdentifierOrNumber() 1370 return True 1371 except ParseError: 1372 return False 1373 1374 def ConsumeIdentifierOrNumber(self): 1375 """Consumes protocol message field identifier. 1376 1377 Returns: 1378 Identifier string. 1379 1380 Raises: 1381 ParseError: If an identifier couldn't be consumed. 1382 """ 1383 result = self.token 1384 if not self._IDENTIFIER_OR_NUMBER.match(result): 1385 raise self.ParseError('Expected identifier or number, got %s.' % result) 1386 self.NextToken() 1387 return result 1388 1389 def TryConsumeInteger(self): 1390 try: 1391 # Note: is_long only affects value type, not whether an error is raised. 1392 self.ConsumeInteger() 1393 return True 1394 except ParseError: 1395 return False 1396 1397 def ConsumeInteger(self, is_long=False): 1398 """Consumes an integer number. 1399 1400 Args: 1401 is_long: True if the value should be returned as a long integer. 1402 Returns: 1403 The integer parsed. 1404 1405 Raises: 1406 ParseError: If an integer couldn't be consumed. 
1407 """ 1408 try: 1409 result = _ParseAbstractInteger(self.token, is_long=is_long) 1410 except ValueError as e: 1411 raise self.ParseError(str(e)) 1412 self.NextToken() 1413 return result 1414 1415 def TryConsumeFloat(self): 1416 try: 1417 self.ConsumeFloat() 1418 return True 1419 except ParseError: 1420 return False 1421 1422 def ConsumeFloat(self): 1423 """Consumes an floating point number. 1424 1425 Returns: 1426 The number parsed. 1427 1428 Raises: 1429 ParseError: If a floating point number couldn't be consumed. 1430 """ 1431 try: 1432 result = ParseFloat(self.token) 1433 except ValueError as e: 1434 raise self.ParseError(str(e)) 1435 self.NextToken() 1436 return result 1437 1438 def ConsumeBool(self): 1439 """Consumes a boolean value. 1440 1441 Returns: 1442 The bool parsed. 1443 1444 Raises: 1445 ParseError: If a boolean value couldn't be consumed. 1446 """ 1447 try: 1448 result = ParseBool(self.token) 1449 except ValueError as e: 1450 raise self.ParseError(str(e)) 1451 self.NextToken() 1452 return result 1453 1454 def TryConsumeByteString(self): 1455 try: 1456 self.ConsumeByteString() 1457 return True 1458 except ParseError: 1459 return False 1460 1461 def ConsumeString(self): 1462 """Consumes a string value. 1463 1464 Returns: 1465 The string parsed. 1466 1467 Raises: 1468 ParseError: If a string value couldn't be consumed. 1469 """ 1470 the_bytes = self.ConsumeByteString() 1471 try: 1472 return six.text_type(the_bytes, 'utf-8') 1473 except UnicodeDecodeError as e: 1474 raise self._StringParseError(e) 1475 1476 def ConsumeByteString(self): 1477 """Consumes a byte array value. 1478 1479 Returns: 1480 The array parsed (as a string). 1481 1482 Raises: 1483 ParseError: If a byte array value couldn't be consumed. 
1484 """ 1485 the_list = [self._ConsumeSingleByteString()] 1486 while self.token and self.token[0] in _QUOTES: 1487 the_list.append(self._ConsumeSingleByteString()) 1488 return b''.join(the_list) 1489 1490 def _ConsumeSingleByteString(self): 1491 """Consume one token of a string literal. 1492 1493 String literals (whether bytes or text) can come in multiple adjacent 1494 tokens which are automatically concatenated, like in C or Python. This 1495 method only consumes one token. 1496 1497 Returns: 1498 The token parsed. 1499 Raises: 1500 ParseError: When the wrong format data is found. 1501 """ 1502 text = self.token 1503 if len(text) < 1 or text[0] not in _QUOTES: 1504 raise self.ParseError('Expected string but found: %r' % (text,)) 1505 1506 if len(text) < 2 or text[-1] != text[0]: 1507 raise self.ParseError('String missing ending quote: %r' % (text,)) 1508 1509 try: 1510 result = text_encoding.CUnescape(text[1:-1]) 1511 except ValueError as e: 1512 raise self.ParseError(str(e)) 1513 self.NextToken() 1514 return result 1515 1516 def ConsumeEnum(self, field): 1517 try: 1518 result = ParseEnum(field, self.token) 1519 except ValueError as e: 1520 raise self.ParseError(str(e)) 1521 self.NextToken() 1522 return result 1523 1524 def ParseErrorPreviousToken(self, message): 1525 """Creates and *returns* a ParseError for the previously read token. 1526 1527 Args: 1528 message: A message to set for the exception. 1529 1530 Returns: 1531 A ParseError instance. 
1532 """ 1533 return ParseError(message, self._previous_line + 1, 1534 self._previous_column + 1) 1535 1536 def ParseError(self, message): 1537 """Creates and *returns* a ParseError for the current token.""" 1538 return ParseError('\'' + self._current_line + '\': ' + message, 1539 self._line + 1, self._column + 1) 1540 1541 def _StringParseError(self, e): 1542 return self.ParseError('Couldn\'t parse string: ' + str(e)) 1543 1544 def NextToken(self): 1545 """Reads the next meaningful token.""" 1546 self._previous_line = self._line 1547 self._previous_column = self._column 1548 1549 self._column += len(self.token) 1550 self._SkipWhitespace() 1551 1552 if not self._more_lines: 1553 self.token = '' 1554 return 1555 1556 match = self._TOKEN.match(self._current_line, self._column) 1557 if not match and not self._skip_comments: 1558 match = self._COMMENT.match(self._current_line, self._column) 1559 if match: 1560 token = match.group(0) 1561 self.token = token 1562 else: 1563 self.token = self._current_line[self._column] 1564 1565# Aliased so it can still be accessed by current visibility violators. 1566# TODO(dbarnett): Migrate violators to textformat_tokenizer. 1567_Tokenizer = Tokenizer # pylint: disable=invalid-name 1568 1569 1570def _ConsumeInt32(tokenizer): 1571 """Consumes a signed 32bit integer number from tokenizer. 1572 1573 Args: 1574 tokenizer: A tokenizer used to parse the number. 1575 1576 Returns: 1577 The integer parsed. 1578 1579 Raises: 1580 ParseError: If a signed 32bit integer couldn't be consumed. 1581 """ 1582 return _ConsumeInteger(tokenizer, is_signed=True, is_long=False) 1583 1584 1585def _ConsumeUint32(tokenizer): 1586 """Consumes an unsigned 32bit integer number from tokenizer. 1587 1588 Args: 1589 tokenizer: A tokenizer used to parse the number. 1590 1591 Returns: 1592 The integer parsed. 1593 1594 Raises: 1595 ParseError: If an unsigned 32bit integer couldn't be consumed. 
1596 """ 1597 return _ConsumeInteger(tokenizer, is_signed=False, is_long=False) 1598 1599 1600def _TryConsumeInt64(tokenizer): 1601 try: 1602 _ConsumeInt64(tokenizer) 1603 return True 1604 except ParseError: 1605 return False 1606 1607 1608def _ConsumeInt64(tokenizer): 1609 """Consumes a signed 32bit integer number from tokenizer. 1610 1611 Args: 1612 tokenizer: A tokenizer used to parse the number. 1613 1614 Returns: 1615 The integer parsed. 1616 1617 Raises: 1618 ParseError: If a signed 32bit integer couldn't be consumed. 1619 """ 1620 return _ConsumeInteger(tokenizer, is_signed=True, is_long=True) 1621 1622 1623def _TryConsumeUint64(tokenizer): 1624 try: 1625 _ConsumeUint64(tokenizer) 1626 return True 1627 except ParseError: 1628 return False 1629 1630 1631def _ConsumeUint64(tokenizer): 1632 """Consumes an unsigned 64bit integer number from tokenizer. 1633 1634 Args: 1635 tokenizer: A tokenizer used to parse the number. 1636 1637 Returns: 1638 The integer parsed. 1639 1640 Raises: 1641 ParseError: If an unsigned 64bit integer couldn't be consumed. 1642 """ 1643 return _ConsumeInteger(tokenizer, is_signed=False, is_long=True) 1644 1645 1646def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False): 1647 try: 1648 _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long) 1649 return True 1650 except ParseError: 1651 return False 1652 1653 1654def _ConsumeInteger(tokenizer, is_signed=False, is_long=False): 1655 """Consumes an integer number from tokenizer. 1656 1657 Args: 1658 tokenizer: A tokenizer used to parse the number. 1659 is_signed: True if a signed integer must be parsed. 1660 is_long: True if a long integer must be parsed. 1661 1662 Returns: 1663 The integer parsed. 1664 1665 Raises: 1666 ParseError: If an integer with given characteristics couldn't be consumed. 
1667 """ 1668 try: 1669 result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long) 1670 except ValueError as e: 1671 raise tokenizer.ParseError(str(e)) 1672 tokenizer.NextToken() 1673 return result 1674 1675 1676def ParseInteger(text, is_signed=False, is_long=False): 1677 """Parses an integer. 1678 1679 Args: 1680 text: The text to parse. 1681 is_signed: True if a signed integer must be parsed. 1682 is_long: True if a long integer must be parsed. 1683 1684 Returns: 1685 The integer value. 1686 1687 Raises: 1688 ValueError: Thrown Iff the text is not a valid integer. 1689 """ 1690 # Do the actual parsing. Exception handling is propagated to caller. 1691 result = _ParseAbstractInteger(text, is_long=is_long) 1692 1693 # Check if the integer is sane. Exceptions handled by callers. 1694 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)] 1695 checker.CheckValue(result) 1696 return result 1697 1698 1699def _ParseAbstractInteger(text, is_long=False): 1700 """Parses an integer without checking size/signedness. 1701 1702 Args: 1703 text: The text to parse. 1704 is_long: True if the value should be returned as a long integer. 1705 1706 Returns: 1707 The integer value. 1708 1709 Raises: 1710 ValueError: Thrown Iff the text is not a valid integer. 1711 """ 1712 # Do the actual parsing. Exception handling is propagated to caller. 1713 orig_text = text 1714 c_octal_match = re.match(r'(-?)0(\d+)$', text) 1715 if c_octal_match: 1716 # Python 3 no longer supports 0755 octal syntax without the 'o', so 1717 # we always use the '0o' prefix for multi-digit numbers starting with 0. 1718 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2) 1719 try: 1720 # We force 32-bit values to int and 64-bit values to long to make 1721 # alternate implementations where the distinction is more significant 1722 # (e.g. the C++ implementation) simpler. 
1723 if is_long: 1724 return long(text, 0) 1725 else: 1726 return int(text, 0) 1727 except ValueError: 1728 raise ValueError('Couldn\'t parse integer: %s' % orig_text) 1729 1730 1731def ParseFloat(text): 1732 """Parse a floating point number. 1733 1734 Args: 1735 text: Text to parse. 1736 1737 Returns: 1738 The number parsed. 1739 1740 Raises: 1741 ValueError: If a floating point number couldn't be parsed. 1742 """ 1743 try: 1744 # Assume Python compatible syntax. 1745 return float(text) 1746 except ValueError: 1747 # Check alternative spellings. 1748 if _FLOAT_INFINITY.match(text): 1749 if text[0] == '-': 1750 return float('-inf') 1751 else: 1752 return float('inf') 1753 elif _FLOAT_NAN.match(text): 1754 return float('nan') 1755 else: 1756 # assume '1.0f' format 1757 try: 1758 return float(text.rstrip('f')) 1759 except ValueError: 1760 raise ValueError('Couldn\'t parse float: %s' % text) 1761 1762 1763def ParseBool(text): 1764 """Parse a boolean value. 1765 1766 Args: 1767 text: Text to parse. 1768 1769 Returns: 1770 Boolean values parsed 1771 1772 Raises: 1773 ValueError: If text is not a valid boolean. 1774 """ 1775 if text in ('true', 't', '1', 'True'): 1776 return True 1777 elif text in ('false', 'f', '0', 'False'): 1778 return False 1779 else: 1780 raise ValueError('Expected "true" or "false".') 1781 1782 1783def ParseEnum(field, value): 1784 """Parse an enum value. 1785 1786 The value can be specified by a number (the enum value), or by 1787 a string literal (the enum name). 1788 1789 Args: 1790 field: Enum field descriptor. 1791 value: String value. 1792 1793 Returns: 1794 Enum value number. 1795 1796 Raises: 1797 ValueError: If the enum value could not be parsed. 1798 """ 1799 enum_descriptor = field.enum_type 1800 try: 1801 number = int(value, 0) 1802 except ValueError: 1803 # Identifier. 1804 enum_value = enum_descriptor.values_by_name.get(value, None) 1805 if enum_value is None: 1806 raise ValueError('Enum type "%s" has no value named %s.' 
% 1807 (enum_descriptor.full_name, value)) 1808 else: 1809 # Numeric value. 1810 if hasattr(field.file, 'syntax'): 1811 # Attribute is checked for compatibility. 1812 if field.file.syntax == 'proto3': 1813 # Proto3 accept numeric unknown enums. 1814 return number 1815 enum_value = enum_descriptor.values_by_number.get(number, None) 1816 if enum_value is None: 1817 raise ValueError('Enum type "%s" has no value with number %d.' % 1818 (enum_descriptor.full_name, number)) 1819 return enum_value.number 1820