# Protocol Buffers - Google's data interchange format
# Copyright 2008 Google Inc. All rights reserved.
# https://developers.google.com/protocol-buffers/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Contains routines for printing protocol messages in text format.

Simple usage example::

  # Create a proto object and serialize it to a text proto string.
  message = my_proto_pb2.MyMessage(foo='bar')
  text_proto = text_format.MessageToString(message)

  # Parse a text proto string.
  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
"""

__author__ = 'kenton@google.com (Kenton Varda)'

# TODO(b/129989314) Import thread contention leads to test failures.
import encodings.raw_unicode_escape  # pylint: disable=unused-import
import encodings.unicode_escape  # pylint: disable=unused-import
import io
import math
import re
import six

from google.protobuf.internal import decoder
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding

if six.PY3:
  # Python 3 has no separate "long" type; alias it so the rest of the module
  # can treat int/long uniformly across Python 2 and 3.
  long = int  # pylint: disable=redefined-builtin,invalid-name

# pylint: disable=g-import-not-at-top
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches text-format float spellings of +/- infinity ("inf", "-Infinity",
# with an optional trailing "f") and NaN ("nan", "nanf"), case-insensitively.
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'


class Error(Exception):
  """Top-level module error for text_format."""


class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    # When both a message and a location are given, prefix the message with
    # "line:column : " so errors are self-describing.
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is not None:
      super(ParseError, self).__init__(message)
    else:
      super(ParseError, self).__init__()
    self._line = line
    self._column = column

  def GetLine(self):
    return self._line

  def GetColumn(self):
    return self._column


class TextWriter(object):
  """In-memory text sink that writes UTF-8 bytes on Python 2, str on Python 3."""

  def __init__(self, as_utf8):
    if six.PY2:
      self._writer = io.BytesIO()
    else:
      self._writer = io.StringIO()

  def write(self, val):
    if six.PY2:
      if isinstance(val, six.text_type):
        val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()


def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than only ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest float
      that has same value in wire will be printed. Also affect double field
      if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(
      out,
      indent,
      as_utf8,
      as_one_line,
      use_short_repeated_primitives,
      pointy_brackets,
      use_index_order,
      float_format,
      double_format,
      use_field_number,
      descriptor_pool,
      message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # One-line mode emits a trailing separator space; trim it.
    return result.rstrip()
  return result


def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    # Python 2 with as_utf8 already yields UTF-8 bytes.
    return text
  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
  return text.encode(codec)


def _IsMapEntry(field):
  # A map<K, V> field is represented as a repeated message field whose
  # message type carries the map_entry option.
  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
          field.message_type.has_options and
          field.message_type.GetOptions().map_entry)


def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Print a whole message to the given writer.  See MessageToString."""
  printer = _Printer(
      out=out, indent=indent, as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)


def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Print a single field name/value pair."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintField(field, value)


def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Print a single field value (not including name)."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintFieldValue(field, value)


def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if the a Descriptor
    wasn't found matching type_name.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  message_type = database.GetPrototype(message_descriptor)
  return message_type()


# These values must match WireType enum in google/protobuf/wire_format.h.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3


class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
          In Python 3 actual Unicode characters may appear as is in strings.
          In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, shortest
        float that has same value in wire will be printed. Also affect double
        field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # double_format falls back to float_format when unset (see docstring).
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field."""
    if '/' not in message.type_url:
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      # Type not found in the pool; fall back to printing raw Any fields.
      return False

  def _TryCustomFormatMessage(self, message):
    # Returns True iff the user-supplied formatter produced output for this
    # message; None from the formatter means "use the default printing".
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(message.UnknownFields())

  def _PrintUnknownFields(self, unknown_fields):
    """Print unknown fields."""
    out = self.out
    for field in unknown_fields:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:    # pylint: disable=broad-except
          pos = 0

        if pos == len(field.data):
          # The payload parsed cleanly as a nested unknown-field set; print
          # it as an embedded message.
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in six.moves.range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    if self.force_colon:
      self.out.write(':')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        # Unknown enum number: print the raw integer.
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        if math.isnan(value):
          out.write(str(value))
        else:
          out.write(str(type_checkers.ToShortestFloat(value)))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))


def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message. This is different from what the binary msg.ParseFrom(...) does.
  If text contains a field already set in message, the value is appended if the
  field is repeated. Otherwise, an error is raised.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on the newline flavor matching the input type (bytes vs text).
  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
                    message,
                    allow_unknown_extension,
                    allow_field_number,
                    descriptor_pool=descriptor_pool,
                    allow_unknown_field=allow_unknown_field)


def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one. This means any non-repeated, top-level fields specified in text
  replace those in the message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return MergeLines(
      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)


def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Parse() for caveats.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.ParseLines(lines, message)


def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Merge() for more details.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension,
                   allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.MergeLines(lines, message)


class _Parser(object):
  """Text format parser for protocol message."""

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

  def ParseLines(self, lines, message):
    """Parses a text representation of a protocol message into a message."""
    # Parse mode: repeated scalar values for the same singular field are an
    # error (contrast with MergeLines).
    self._allow_multiple_scalars = False
    self._ParseOrMerge(lines, message)
    return message

  def MergeLines(self, lines, message):
    """Merges a text representation of a protocol message into a message."""
    # Merge mode: later values overwrite earlier ones for singular fields.
    self._allow_multiple_scalars = True
    self._ParseOrMerge(lines, message)
    return message

  def _ParseOrMerge(self, lines, message):
    """Converts a text representation of a protocol message into a message.

    Args:
      lines: Lines of a message's text representation.
      message: A protocol buffer message to merge into.

    Raises:
      ParseError: On text parsing problems.
    """
    # Tokenize expects native str lines.
    if six.PY2:
      str_lines = (line if isinstance(line, str) else line.encode('utf-8')
                   for line in lines)
    else:
      str_lines = (line if isinstance(line, str) else line.decode('utf-8')
                   for line in lines)
    tokenizer = Tokenizer(str_lines)
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: "[type.googleapis.com/full.type.Name] { ... }".
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      deterministic = False

      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix,
                   deterministic=deterministic)
      return

    # Extension field: "[full.extension.name]".
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        # Colon is optional before an embedded message.
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    return '.'.join(prefix), '.'.join(name)

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
        # Mark presence even if the message body turns out to be empty.
        sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      # Copy the parsed entry message into the actual map field; message
      # values are merged, scalar values are overwritten.
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

  @staticmethod
  def _IsProto3Syntax(message):
    # The hasattr check keeps this working with descriptor implementations
    # that do not expose a `syntax` attribute.
    message_descriptor = message.DESCRIPTOR
    return (hasattr(message_descriptor, 'syntax') and
            message_descriptor.syntax == 'proto3')

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
    """
    _ = self.allow_unknown_extension
    value = None

    # Dispatch on the wire type to the matching consume routine.
    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                      descriptor.FieldDescriptor.TYPE_SINT32,
                      descriptor.FieldDescriptor.TYPE_SFIXED32):
      value = _ConsumeInt32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                        descriptor.FieldDescriptor.TYPE_SINT64,
                        descriptor.FieldDescriptor.TYPE_SFIXED64):
      value = _ConsumeInt64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                        descriptor.FieldDescriptor.TYPE_FIXED32):
      value = _ConsumeUint32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                        descriptor.FieldDescriptor.TYPE_FIXED64):
      value = _ConsumeUint64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                        descriptor.FieldDescriptor.TYPE_DOUBLE):
      value = tokenizer.ConsumeFloat()
    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
      value = tokenizer.ConsumeBool()
    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
      value = tokenizer.ConsumeString()
    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      value = tokenizer.ConsumeByteString()
    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
      value = tokenizer.ConsumeEnum(field)
    else:
      raise RuntimeError('Unknown field type %d' % field.type)

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        message.Extensions[field].append(value)
      else:
        getattr(message, field.name).append(value)
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            not self._IsProto3Syntax(message) and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        else:
          message.Extensions[field] = value
      else:
        duplicate_error = False
        if not self._allow_multiple_scalars:
          if self._IsProto3Syntax(message):
            # Proto3 doesn't represent presence so we try best effort to check
            # multiple scalars by comparing to default values.
            duplicate_error = bool(getattr(message, field.name))
          else:
            duplicate_error = message.HasField(field.name)

        if duplicate_error:
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        else:
          setattr(message, field.name, value)


def _SkipFieldContents(tokenizer):
  """Skips over contents (value or message) of a field.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Try to guess the type of this field.
  # If this field is not a message, there should be a ":" between the
  # field name and the field value and also the field value should not
  # start with "{" or "<" which indicates the beginning of a message body.
  # If there is no ":" or there is a "{" or "<" after ":", this field has
  # to be a message or the input is ill-formed.
  if tokenizer.TryConsume(':') and not tokenizer.LookingAt(
      '{') and not tokenizer.LookingAt('<'):
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)


def _SkipField(tokenizer):
  """Skips over a complete field (name and value/message).

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  if tokenizer.TryConsume('['):
    # Consume extension name.
1173 tokenizer.ConsumeIdentifier() 1174 while tokenizer.TryConsume('.'): 1175 tokenizer.ConsumeIdentifier() 1176 tokenizer.Consume(']') 1177 else: 1178 tokenizer.ConsumeIdentifierOrNumber() 1179 1180 _SkipFieldContents(tokenizer) 1181 1182 # For historical reasons, fields may optionally be separated by commas or 1183 # semicolons. 1184 if not tokenizer.TryConsume(','): 1185 tokenizer.TryConsume(';') 1186 1187 1188def _SkipFieldMessage(tokenizer): 1189 """Skips over a field message. 1190 1191 Args: 1192 tokenizer: A tokenizer to parse the field name and values. 1193 """ 1194 1195 if tokenizer.TryConsume('<'): 1196 delimiter = '>' 1197 else: 1198 tokenizer.Consume('{') 1199 delimiter = '}' 1200 1201 while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'): 1202 _SkipField(tokenizer) 1203 1204 tokenizer.Consume(delimiter) 1205 1206 1207def _SkipFieldValue(tokenizer): 1208 """Skips over a field value. 1209 1210 Args: 1211 tokenizer: A tokenizer to parse the field name and values. 1212 1213 Raises: 1214 ParseError: In case an invalid field value is found. 1215 """ 1216 # String/bytes tokens can come in multiple adjacent string literals. 1217 # If we can consume one, consume as many as we can. 1218 if tokenizer.TryConsumeByteString(): 1219 while tokenizer.TryConsumeByteString(): 1220 pass 1221 return 1222 1223 if (not tokenizer.TryConsumeIdentifier() and 1224 not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and 1225 not tokenizer.TryConsumeFloat()): 1226 raise ParseError('Invalid field value: ' + tokenizer.token) 1227 1228 1229class Tokenizer(object): 1230 """Protocol buffer text representation tokenizer. 1231 1232 This class handles the lower level string parsing by splitting it into 1233 meaningful tokens. 1234 1235 It was directly ported from the Java protocol buffer API. 
1236 """ 1237 1238 _WHITESPACE = re.compile(r'\s+') 1239 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE) 1240 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE) 1241 _TOKEN = re.compile('|'.join([ 1242 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier 1243 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number 1244 ] + [ # quoted str for each quote mark 1245 # Avoid backtracking! https://stackoverflow.com/a/844267 1246 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark) 1247 for mark in _QUOTES 1248 ])) 1249 1250 _IDENTIFIER = re.compile(r'[^\d\W]\w*') 1251 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+') 1252 1253 def __init__(self, lines, skip_comments=True): 1254 self._position = 0 1255 self._line = -1 1256 self._column = 0 1257 self._token_start = None 1258 self.token = '' 1259 self._lines = iter(lines) 1260 self._current_line = '' 1261 self._previous_line = 0 1262 self._previous_column = 0 1263 self._more_lines = True 1264 self._skip_comments = skip_comments 1265 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT 1266 or self._WHITESPACE) 1267 self._SkipWhitespace() 1268 self.NextToken() 1269 1270 def LookingAt(self, token): 1271 return self.token == token 1272 1273 def AtEnd(self): 1274 """Checks the end of the text was reached. 1275 1276 Returns: 1277 True iff the end was reached. 
1278 """ 1279 return not self.token 1280 1281 def _PopLine(self): 1282 while len(self._current_line) <= self._column: 1283 try: 1284 self._current_line = next(self._lines) 1285 except StopIteration: 1286 self._current_line = '' 1287 self._more_lines = False 1288 return 1289 else: 1290 self._line += 1 1291 self._column = 0 1292 1293 def _SkipWhitespace(self): 1294 while True: 1295 self._PopLine() 1296 match = self._whitespace_pattern.match(self._current_line, self._column) 1297 if not match: 1298 break 1299 length = len(match.group(0)) 1300 self._column += length 1301 1302 def TryConsume(self, token): 1303 """Tries to consume a given piece of text. 1304 1305 Args: 1306 token: Text to consume. 1307 1308 Returns: 1309 True iff the text was consumed. 1310 """ 1311 if self.token == token: 1312 self.NextToken() 1313 return True 1314 return False 1315 1316 def Consume(self, token): 1317 """Consumes a piece of text. 1318 1319 Args: 1320 token: Text to consume. 1321 1322 Raises: 1323 ParseError: If the text couldn't be consumed. 1324 """ 1325 if not self.TryConsume(token): 1326 raise self.ParseError('Expected "%s".' % token) 1327 1328 def ConsumeComment(self): 1329 result = self.token 1330 if not self._COMMENT.match(result): 1331 raise self.ParseError('Expected comment.') 1332 self.NextToken() 1333 return result 1334 1335 def ConsumeCommentOrTrailingComment(self): 1336 """Consumes a comment, returns a 2-tuple (trailing bool, comment str).""" 1337 1338 # Tokenizer initializes _previous_line and _previous_column to 0. As the 1339 # tokenizer starts, it looks like there is a previous token on the line. 1340 just_started = self._line == 0 and self._column == 0 1341 1342 before_parsing = self._previous_line 1343 comment = self.ConsumeComment() 1344 1345 # A trailing comment is a comment on the same line than the previous token. 
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier or number.

    Returns:
      Identifier or number string.

    Raises:
      ParseError: If an identifier or number couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    try:
      # Note: is_long only affects value type, not whether an error is raised.
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self, is_long=False):
    """Consumes an integer number.

    Args:
      is_long: True if the value should be returned as a long integer.
    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token, is_long=is_long)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      # Text-typed fields must be valid UTF-8.
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
1490 """ 1491 the_list = [self._ConsumeSingleByteString()] 1492 while self.token and self.token[0] in _QUOTES: 1493 the_list.append(self._ConsumeSingleByteString()) 1494 return b''.join(the_list) 1495 1496 def _ConsumeSingleByteString(self): 1497 """Consume one token of a string literal. 1498 1499 String literals (whether bytes or text) can come in multiple adjacent 1500 tokens which are automatically concatenated, like in C or Python. This 1501 method only consumes one token. 1502 1503 Returns: 1504 The token parsed. 1505 Raises: 1506 ParseError: When the wrong format data is found. 1507 """ 1508 text = self.token 1509 if len(text) < 1 or text[0] not in _QUOTES: 1510 raise self.ParseError('Expected string but found: %r' % (text,)) 1511 1512 if len(text) < 2 or text[-1] != text[0]: 1513 raise self.ParseError('String missing ending quote: %r' % (text,)) 1514 1515 try: 1516 result = text_encoding.CUnescape(text[1:-1]) 1517 except ValueError as e: 1518 raise self.ParseError(str(e)) 1519 self.NextToken() 1520 return result 1521 1522 def ConsumeEnum(self, field): 1523 try: 1524 result = ParseEnum(field, self.token) 1525 except ValueError as e: 1526 raise self.ParseError(str(e)) 1527 self.NextToken() 1528 return result 1529 1530 def ParseErrorPreviousToken(self, message): 1531 """Creates and *returns* a ParseError for the previously read token. 1532 1533 Args: 1534 message: A message to set for the exception. 1535 1536 Returns: 1537 A ParseError instance. 
1538 """ 1539 return ParseError(message, self._previous_line + 1, 1540 self._previous_column + 1) 1541 1542 def ParseError(self, message): 1543 """Creates and *returns* a ParseError for the current token.""" 1544 return ParseError('\'' + self._current_line + '\': ' + message, 1545 self._line + 1, self._column + 1) 1546 1547 def _StringParseError(self, e): 1548 return self.ParseError('Couldn\'t parse string: ' + str(e)) 1549 1550 def NextToken(self): 1551 """Reads the next meaningful token.""" 1552 self._previous_line = self._line 1553 self._previous_column = self._column 1554 1555 self._column += len(self.token) 1556 self._SkipWhitespace() 1557 1558 if not self._more_lines: 1559 self.token = '' 1560 return 1561 1562 match = self._TOKEN.match(self._current_line, self._column) 1563 if not match and not self._skip_comments: 1564 match = self._COMMENT.match(self._current_line, self._column) 1565 if match: 1566 token = match.group(0) 1567 self.token = token 1568 else: 1569 self.token = self._current_line[self._column] 1570 1571# Aliased so it can still be accessed by current visibility violators. 1572# TODO(dbarnett): Migrate violators to textformat_tokenizer. 1573_Tokenizer = Tokenizer # pylint: disable=invalid-name 1574 1575 1576def _ConsumeInt32(tokenizer): 1577 """Consumes a signed 32bit integer number from tokenizer. 1578 1579 Args: 1580 tokenizer: A tokenizer used to parse the number. 1581 1582 Returns: 1583 The integer parsed. 1584 1585 Raises: 1586 ParseError: If a signed 32bit integer couldn't be consumed. 1587 """ 1588 return _ConsumeInteger(tokenizer, is_signed=True, is_long=False) 1589 1590 1591def _ConsumeUint32(tokenizer): 1592 """Consumes an unsigned 32bit integer number from tokenizer. 1593 1594 Args: 1595 tokenizer: A tokenizer used to parse the number. 1596 1597 Returns: 1598 The integer parsed. 1599 1600 Raises: 1601 ParseError: If an unsigned 32bit integer couldn't be consumed. 
1602 """ 1603 return _ConsumeInteger(tokenizer, is_signed=False, is_long=False) 1604 1605 1606def _TryConsumeInt64(tokenizer): 1607 try: 1608 _ConsumeInt64(tokenizer) 1609 return True 1610 except ParseError: 1611 return False 1612 1613 1614def _ConsumeInt64(tokenizer): 1615 """Consumes a signed 32bit integer number from tokenizer. 1616 1617 Args: 1618 tokenizer: A tokenizer used to parse the number. 1619 1620 Returns: 1621 The integer parsed. 1622 1623 Raises: 1624 ParseError: If a signed 32bit integer couldn't be consumed. 1625 """ 1626 return _ConsumeInteger(tokenizer, is_signed=True, is_long=True) 1627 1628 1629def _TryConsumeUint64(tokenizer): 1630 try: 1631 _ConsumeUint64(tokenizer) 1632 return True 1633 except ParseError: 1634 return False 1635 1636 1637def _ConsumeUint64(tokenizer): 1638 """Consumes an unsigned 64bit integer number from tokenizer. 1639 1640 Args: 1641 tokenizer: A tokenizer used to parse the number. 1642 1643 Returns: 1644 The integer parsed. 1645 1646 Raises: 1647 ParseError: If an unsigned 64bit integer couldn't be consumed. 1648 """ 1649 return _ConsumeInteger(tokenizer, is_signed=False, is_long=True) 1650 1651 1652def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False): 1653 try: 1654 _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long) 1655 return True 1656 except ParseError: 1657 return False 1658 1659 1660def _ConsumeInteger(tokenizer, is_signed=False, is_long=False): 1661 """Consumes an integer number from tokenizer. 1662 1663 Args: 1664 tokenizer: A tokenizer used to parse the number. 1665 is_signed: True if a signed integer must be parsed. 1666 is_long: True if a long integer must be parsed. 1667 1668 Returns: 1669 The integer parsed. 1670 1671 Raises: 1672 ParseError: If an integer with given characteristics couldn't be consumed. 
1673 """ 1674 try: 1675 result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long) 1676 except ValueError as e: 1677 raise tokenizer.ParseError(str(e)) 1678 tokenizer.NextToken() 1679 return result 1680 1681 1682def ParseInteger(text, is_signed=False, is_long=False): 1683 """Parses an integer. 1684 1685 Args: 1686 text: The text to parse. 1687 is_signed: True if a signed integer must be parsed. 1688 is_long: True if a long integer must be parsed. 1689 1690 Returns: 1691 The integer value. 1692 1693 Raises: 1694 ValueError: Thrown Iff the text is not a valid integer. 1695 """ 1696 # Do the actual parsing. Exception handling is propagated to caller. 1697 result = _ParseAbstractInteger(text, is_long=is_long) 1698 1699 # Check if the integer is sane. Exceptions handled by callers. 1700 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)] 1701 checker.CheckValue(result) 1702 return result 1703 1704 1705def _ParseAbstractInteger(text, is_long=False): 1706 """Parses an integer without checking size/signedness. 1707 1708 Args: 1709 text: The text to parse. 1710 is_long: True if the value should be returned as a long integer. 1711 1712 Returns: 1713 The integer value. 1714 1715 Raises: 1716 ValueError: Thrown Iff the text is not a valid integer. 1717 """ 1718 # Do the actual parsing. Exception handling is propagated to caller. 1719 orig_text = text 1720 c_octal_match = re.match(r'(-?)0(\d+)$', text) 1721 if c_octal_match: 1722 # Python 3 no longer supports 0755 octal syntax without the 'o', so 1723 # we always use the '0o' prefix for multi-digit numbers starting with 0. 1724 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2) 1725 try: 1726 # We force 32-bit values to int and 64-bit values to long to make 1727 # alternate implementations where the distinction is more significant 1728 # (e.g. the C++ implementation) simpler. 
    if is_long:
      return long(text, 0)
    else:
      # Base 0 lets int() auto-detect decimal, hex (0x) and octal (0o).
      return int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % orig_text)


def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)


def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    The boolean value parsed.

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')


def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Numeric value.
    if hasattr(field.file, 'syntax'):
      # Attribute is checked for compatibility.
      if field.file.syntax == 'proto3':
        # Proto3 accepts numeric unknown enums.
        return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number