# Protocol Buffers - Google's data interchange format
# Copyright 2008 Google Inc.  All rights reserved.
# https://developers.google.com/protocol-buffers/
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Contains routines for printing protocol messages in text format.

Simple usage example:

  # Create a proto object and serialize it to a text proto string.
  message = my_proto_pb2.MyMessage(foo='bar')
  text_proto = text_format.MessageToString(message)

  # Parse a text proto string.
  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
"""

__author__ = 'kenton@google.com (Kenton Varda)'

import io
import re

import six

if six.PY3:
  long = int  # pylint: disable=redefined-builtin,invalid-name

# pylint: disable=g-import-not-at-top
from google.protobuf.internal import decoder
from google.protobuf.internal import type_checkers
from google.protobuf import descriptor
from google.protobuf import text_encoding

__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches text-format float spellings of infinity: "inf", "infinity",
# optional leading '-', optional trailing 'f' (float literal suffix).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
_QUOTES = frozenset(("'", '"'))
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'


class Error(Exception):
  """Top-level module error for text_format."""


class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    """Initializes the error.

    Args:
      message: Human-readable description of the problem, or None.
      line: Line number where the error occurred, or None if unknown.
      column: Column where the error occurred, or None if unknown.
    """
    # When location information is available, prefix the message with
    # "line:column : " (or just "line : " if no column was given).
    if message is not None and line is not None:
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is not None:
      super(ParseError, self).__init__(message)
    else:
      super(ParseError, self).__init__()
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the line number passed to the constructor (may be None)."""
    return self._line

  def GetColumn(self):
    """Returns the column passed to the constructor (may be None)."""
    return self._column


class TextWriter(object):
  """In-memory text sink: accumulates bytes on Python 2, str on Python 3."""

  def __init__(self, as_utf8):
    # NOTE: as_utf8 is currently unused here; the buffer type depends only on
    # the Python major version. The parameter is kept for interface
    # compatibility with callers that pass it.
    if six.PY2:
      self._writer = io.BytesIO()
    else:
      self._writer = io.StringIO()

  def write(self, val):
    """Writes val to the buffer, UTF-8-encoding unicode on Python 2."""
    if six.PY2:
      if isinstance(val, six.text_type):
        val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    """Closes the underlying buffer."""
    return self._writer.close()

  def getvalue(self):
    """Returns everything written so far."""
    return self._writer.getvalue()


def MessageToString(message,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    use_field_number=False,
                    descriptor_pool=None,
                    indent=0,
                    message_formatter=None,
                    print_unknown_fields=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than only
        ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
    use_index_order: If True, fields of a proto message will be printed using
        the order defined in source code instead of the field number,
        extensions will be printed at the end of the message and their
        relative order is determined by the extension number. By default, use
        the field number order.
    float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, 8 valid
        digits is used (default '.8g'). Also affect double field if
        double_format is not set but float_format is set.
    double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, use str()
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    indent: The initial indent level, in terms of spaces, for pretty print.
    message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message
        type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.

  Returns:
    A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     use_field_number,
                     descriptor_pool, message_formatter,
                     print_unknown_fields=print_unknown_fields)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  if as_one_line:
    # One-line output ends with a trailing field separator; strip it.
    return result.rstrip()
  return result


def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    # Python 2 path: MessageToString already produced bytes.
    return text
  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
  return text.encode(codec)


def _IsMapEntry(field):
  """Returns True if field is the synthesized entry type of a map field."""
  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
          field.message_type.has_options and
          field.message_type.GetOptions().map_entry)


def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False):
  """Convert the message to text format and write it to the out stream.

  See MessageToString for the meaning of the formatting arguments.
  """
  printer = _Printer(
      out=out, indent=indent, as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields)
  printer.PrintMessage(message)


def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False):
  """Print a single field name/value pair."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields)
  printer.PrintField(field, value)


def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False):
  """Print a single field value (not including name)."""
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields)
  printer.PrintFieldValue(field, value)


def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a protobuf message instance.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if the a Descriptor
    wasn't found matching type_name.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  message_type = database.GetPrototype(message_descriptor)
  return message_type()


# These values must match WireType enum in google/protobuf/wire_format.h.
303WIRETYPE_LENGTH_DELIMITED = 2 304WIRETYPE_START_GROUP = 3 305 306 307class _Printer(object): 308 """Text format printer for protocol message.""" 309 310 def __init__(self, 311 out, 312 indent=0, 313 as_utf8=False, 314 as_one_line=False, 315 use_short_repeated_primitives=False, 316 pointy_brackets=False, 317 use_index_order=False, 318 float_format=None, 319 double_format=None, 320 use_field_number=False, 321 descriptor_pool=None, 322 message_formatter=None, 323 print_unknown_fields=False): 324 """Initialize the Printer. 325 326 Double values can be formatted compactly with 15 digits of precision 327 (which is the most that IEEE 754 "double" can guarantee) using 328 double_format='.15g'. To ensure that converting to text and back to a proto 329 will result in an identical value, double_format='.17g' should be used. 330 331 Args: 332 out: To record the text format result. 333 indent: The initial indent level for pretty print. 334 as_utf8: Return unescaped Unicode for non-ASCII characters. 335 In Python 3 actual Unicode characters may appear as is in strings. 336 In Python 2 the return value will be valid UTF-8 rather than ASCII. 337 as_one_line: Don't introduce newlines between fields. 338 use_short_repeated_primitives: Use short repeated format for primitives. 339 pointy_brackets: If True, use angle brackets instead of curly braces for 340 nesting. 341 use_index_order: If True, print fields of a proto message using the order 342 defined in source code instead of the field number. By default, use the 343 field number order. 344 float_format: If set, use this to specify float field formatting 345 (per the "Format Specification Mini-Language"); otherwise, 8 valid 346 digits is used (default '.8g'). Also affect double field if 347 double_format is not set but float_format is set. 348 double_format: If set, use this to specify double field formatting 349 (per the "Format Specification Mini-Language"); if it is not set but 350 float_format is set, use float_format. 
Otherwise, str() is used. 351 use_field_number: If True, print field numbers instead of names. 352 descriptor_pool: A DescriptorPool used to resolve Any types. 353 message_formatter: A function(message, indent, as_one_line): unicode|None 354 to custom format selected sub-messages (usually based on message type). 355 Use to pretty print parts of the protobuf for easier diffing. 356 print_unknown_fields: If True, unknown fields will be printed. 357 """ 358 self.out = out 359 self.indent = indent 360 self.as_utf8 = as_utf8 361 self.as_one_line = as_one_line 362 self.use_short_repeated_primitives = use_short_repeated_primitives 363 self.pointy_brackets = pointy_brackets 364 self.use_index_order = use_index_order 365 self.float_format = float_format 366 if double_format is not None: 367 self.double_format = double_format 368 else: 369 self.double_format = float_format 370 self.use_field_number = use_field_number 371 self.descriptor_pool = descriptor_pool 372 self.message_formatter = message_formatter 373 self.print_unknown_fields = print_unknown_fields 374 375 def _TryPrintAsAnyMessage(self, message): 376 """Serializes if message is a google.protobuf.Any field.""" 377 if '/' not in message.type_url: 378 return False 379 packed_message = _BuildMessageFromTypeName(message.TypeName(), 380 self.descriptor_pool) 381 if packed_message: 382 packed_message.MergeFromString(message.value) 383 self.out.write('%s[%s] ' % (self.indent * ' ', message.type_url)) 384 self._PrintMessageFieldValue(packed_message) 385 self.out.write(' ' if self.as_one_line else '\n') 386 return True 387 else: 388 return False 389 390 def _TryCustomFormatMessage(self, message): 391 formatted = self.message_formatter(message, self.indent, self.as_one_line) 392 if formatted is None: 393 return False 394 395 out = self.out 396 out.write(' ' * self.indent) 397 out.write(formatted) 398 out.write(' ' if self.as_one_line else '\n') 399 return True 400 401 def PrintMessage(self, message): 402 """Convert protobuf 
message to text format. 403 404 Args: 405 message: The protocol buffers message. 406 """ 407 if self.message_formatter and self._TryCustomFormatMessage(message): 408 return 409 if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and 410 self._TryPrintAsAnyMessage(message)): 411 return 412 fields = message.ListFields() 413 if self.use_index_order: 414 fields.sort( 415 key=lambda x: x[0].number if x[0].is_extension else x[0].index) 416 for field, value in fields: 417 if _IsMapEntry(field): 418 for key in sorted(value): 419 # This is slow for maps with submessage entries because it copies the 420 # entire tree. Unfortunately this would take significant refactoring 421 # of this file to work around. 422 # 423 # TODO(haberman): refactor and optimize if this becomes an issue. 424 entry_submsg = value.GetEntryClass()(key=key, value=value[key]) 425 self.PrintField(field, entry_submsg) 426 elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 427 if (self.use_short_repeated_primitives 428 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE 429 and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING): 430 self._PrintShortRepeatedPrimitivesValue(field, value) 431 else: 432 for element in value: 433 self.PrintField(field, element) 434 else: 435 self.PrintField(field, value) 436 437 if self.print_unknown_fields: 438 self._PrintUnknownFields(message.UnknownFields()) 439 440 def _PrintUnknownFields(self, unknown_fields): 441 """Print unknown fields.""" 442 out = self.out 443 for field in unknown_fields: 444 out.write(' ' * self.indent) 445 out.write(str(field.field_number)) 446 if field.wire_type == WIRETYPE_START_GROUP: 447 if self.as_one_line: 448 out.write(' { ') 449 else: 450 out.write(' {\n') 451 self.indent += 2 452 453 self._PrintUnknownFields(field.data) 454 455 if self.as_one_line: 456 out.write('} ') 457 else: 458 out.write('}\n') 459 self.indent -= 2 460 elif field.wire_type == WIRETYPE_LENGTH_DELIMITED: 461 try: 462 # If this field is 
parseable as a Message, it is probably 463 # an embedded message. 464 # pylint: disable=protected-access 465 (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet( 466 memoryview(field.data), 0, len(field.data)) 467 except Exception: # pylint: disable=broad-except 468 pos = 0 469 470 if pos == len(field.data): 471 if self.as_one_line: 472 out.write(' { ') 473 else: 474 out.write(' {\n') 475 self.indent += 2 476 477 self._PrintUnknownFields(embedded_unknown_message) 478 479 if self.as_one_line: 480 out.write('} ') 481 else: 482 out.write('}\n') 483 self.indent -= 2 484 else: 485 # A string or bytes field. self.as_utf8 may not work. 486 out.write(': \"') 487 out.write(text_encoding.CEscape(field.data, False)) 488 out.write('\" ' if self.as_one_line else '\"\n') 489 else: 490 # varint, fixed32, fixed64 491 out.write(': ') 492 out.write(str(field.data)) 493 out.write(' ' if self.as_one_line else '\n') 494 495 def _PrintFieldName(self, field): 496 """Print field name.""" 497 out = self.out 498 out.write(' ' * self.indent) 499 if self.use_field_number: 500 out.write(str(field.number)) 501 else: 502 if field.is_extension: 503 out.write('[') 504 if (field.containing_type.GetOptions().message_set_wire_format and 505 field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and 506 field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL): 507 out.write(field.message_type.full_name) 508 else: 509 out.write(field.full_name) 510 out.write(']') 511 elif field.type == descriptor.FieldDescriptor.TYPE_GROUP: 512 # For groups, use the capitalized name. 513 out.write(field.message_type.name) 514 else: 515 out.write(field.name) 516 517 if field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 518 # The colon is optional in this case, but our cross-language golden files 519 # don't include it. 
520 out.write(':') 521 522 def PrintField(self, field, value): 523 """Print a single field name/value pair.""" 524 self._PrintFieldName(field) 525 self.out.write(' ') 526 self.PrintFieldValue(field, value) 527 self.out.write(' ' if self.as_one_line else '\n') 528 529 def _PrintShortRepeatedPrimitivesValue(self, field, value): 530 # Note: this is called only when value has at least one element. 531 self._PrintFieldName(field) 532 self.out.write(' [') 533 for i in six.moves.range(len(value) - 1): 534 self.PrintFieldValue(field, value[i]) 535 self.out.write(', ') 536 self.PrintFieldValue(field, value[-1]) 537 self.out.write(']') 538 self.out.write(' ' if self.as_one_line else '\n') 539 540 def _PrintMessageFieldValue(self, value): 541 if self.pointy_brackets: 542 openb = '<' 543 closeb = '>' 544 else: 545 openb = '{' 546 closeb = '}' 547 548 if self.as_one_line: 549 self.out.write('%s ' % openb) 550 self.PrintMessage(value) 551 self.out.write(closeb) 552 else: 553 self.out.write('%s\n' % openb) 554 self.indent += 2 555 self.PrintMessage(value) 556 self.indent -= 2 557 self.out.write(' ' * self.indent + closeb) 558 559 def PrintFieldValue(self, field, value): 560 """Print a single field value (not including name). 561 562 For repeated fields, the value should be a single element. 563 564 Args: 565 field: The descriptor of the field to be printed. 566 value: The value of the field. 
567 """ 568 out = self.out 569 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 570 self._PrintMessageFieldValue(value) 571 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM: 572 enum_value = field.enum_type.values_by_number.get(value, None) 573 if enum_value is not None: 574 out.write(enum_value.name) 575 else: 576 out.write(str(value)) 577 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING: 578 out.write('\"') 579 if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8): 580 out_value = value.encode('utf-8') 581 else: 582 out_value = value 583 if field.type == descriptor.FieldDescriptor.TYPE_BYTES: 584 # We always need to escape all binary data in TYPE_BYTES fields. 585 out_as_utf8 = False 586 else: 587 out_as_utf8 = self.as_utf8 588 out.write(text_encoding.CEscape(out_value, out_as_utf8)) 589 out.write('\"') 590 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL: 591 if value: 592 out.write('true') 593 else: 594 out.write('false') 595 elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT: 596 if self.float_format is not None: 597 out.write('{1:{0}}'.format(self.float_format, value)) 598 else: 599 out.write(str(float(format(value, '.8g')))) 600 elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and 601 self.double_format is not None): 602 out.write('{1:{0}}'.format(self.double_format, value)) 603 else: 604 out.write(str(value)) 605 606 607def Parse(text, 608 message, 609 allow_unknown_extension=False, 610 allow_field_number=False, 611 descriptor_pool=None, 612 allow_unknown_field=False): 613 """Parses a text representation of a protocol message into a message. 614 615 NOTE: for historical reasons this function does not clear the input 616 message. This is different from what the binary msg.ParseFrom(...) does. 
617 618 Example 619 a = MyProto() 620 a.repeated_field.append('test') 621 b = MyProto() 622 623 text_format.Parse(repr(a), b) 624 text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"] 625 626 # Binary version: 627 b.ParseFromString(a.SerializeToString()) # repeated_field is now "test" 628 629 Caller is responsible for clearing the message as needed. 630 631 Args: 632 text: Message text representation. 633 message: A protocol buffer message to merge into. 634 allow_unknown_extension: if True, skip over missing extensions and keep 635 parsing 636 allow_field_number: if True, both field number and field name are allowed. 637 descriptor_pool: A DescriptorPool used to resolve Any types. 638 allow_unknown_field: if True, skip over unknown field and keep 639 parsing. Avoid to use this option if possible. It may hide some 640 errors (e.g. spelling error on field name) 641 642 Returns: 643 The same message passed as argument. 644 645 Raises: 646 ParseError: On text parsing problems. 647 """ 648 return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'), 649 message, 650 allow_unknown_extension, 651 allow_field_number, 652 descriptor_pool=descriptor_pool, 653 allow_unknown_field=allow_unknown_field) 654 655 656def Merge(text, 657 message, 658 allow_unknown_extension=False, 659 allow_field_number=False, 660 descriptor_pool=None, 661 allow_unknown_field=False): 662 """Parses a text representation of a protocol message into a message. 663 664 Like Parse(), but allows repeated values for a non-repeated field, and uses 665 the last one. 666 667 Args: 668 text: Message text representation. 669 message: A protocol buffer message to merge into. 670 allow_unknown_extension: if True, skip over missing extensions and keep 671 parsing 672 allow_field_number: if True, both field number and field name are allowed. 673 descriptor_pool: A DescriptorPool used to resolve Any types. 
674 allow_unknown_field: if True, skip over unknown field and keep 675 parsing. Avoid to use this option if possible. It may hide some 676 errors (e.g. spelling error on field name) 677 678 Returns: 679 The same message passed as argument. 680 681 Raises: 682 ParseError: On text parsing problems. 683 """ 684 return MergeLines( 685 text.split(b'\n' if isinstance(text, bytes) else u'\n'), 686 message, 687 allow_unknown_extension, 688 allow_field_number, 689 descriptor_pool=descriptor_pool, 690 allow_unknown_field=allow_unknown_field) 691 692 693def ParseLines(lines, 694 message, 695 allow_unknown_extension=False, 696 allow_field_number=False, 697 descriptor_pool=None, 698 allow_unknown_field=False): 699 """Parses a text representation of a protocol message into a message. 700 701 Args: 702 lines: An iterable of lines of a message's text representation. 703 message: A protocol buffer message to merge into. 704 allow_unknown_extension: if True, skip over missing extensions and keep 705 parsing 706 allow_field_number: if True, both field number and field name are allowed. 707 descriptor_pool: A DescriptorPool used to resolve Any types. 708 allow_unknown_field: if True, skip over unknown field and keep 709 parsing. Avoid to use this option if possible. It may hide some 710 errors (e.g. spelling error on field name) 711 712 Returns: 713 The same message passed as argument. 714 715 Raises: 716 ParseError: On text parsing problems. 717 """ 718 parser = _Parser(allow_unknown_extension, 719 allow_field_number, 720 descriptor_pool=descriptor_pool, 721 allow_unknown_field=allow_unknown_field) 722 return parser.ParseLines(lines, message) 723 724 725def MergeLines(lines, 726 message, 727 allow_unknown_extension=False, 728 allow_field_number=False, 729 descriptor_pool=None, 730 allow_unknown_field=False): 731 """Parses a text representation of a protocol message into a message. 
732 733 Like ParseLines(), but allows repeated values for a non-repeated field, and 734 uses the last one. 735 736 Args: 737 lines: An iterable of lines of a message's text representation. 738 message: A protocol buffer message to merge into. 739 allow_unknown_extension: if True, skip over missing extensions and keep 740 parsing 741 allow_field_number: if True, both field number and field name are allowed. 742 descriptor_pool: A DescriptorPool used to resolve Any types. 743 allow_unknown_field: if True, skip over unknown field and keep 744 parsing. Avoid to use this option if possible. It may hide some 745 errors (e.g. spelling error on field name) 746 747 Returns: 748 The same message passed as argument. 749 750 Raises: 751 ParseError: On text parsing problems. 752 """ 753 parser = _Parser(allow_unknown_extension, 754 allow_field_number, 755 descriptor_pool=descriptor_pool, 756 allow_unknown_field=allow_unknown_field) 757 return parser.MergeLines(lines, message) 758 759 760class _Parser(object): 761 """Text format parser for protocol message.""" 762 763 def __init__(self, 764 allow_unknown_extension=False, 765 allow_field_number=False, 766 descriptor_pool=None, 767 allow_unknown_field=False): 768 self.allow_unknown_extension = allow_unknown_extension 769 self.allow_field_number = allow_field_number 770 self.descriptor_pool = descriptor_pool 771 self.allow_unknown_field = allow_unknown_field 772 773 def ParseLines(self, lines, message): 774 """Parses a text representation of a protocol message into a message.""" 775 self._allow_multiple_scalars = False 776 self._ParseOrMerge(lines, message) 777 return message 778 779 def MergeLines(self, lines, message): 780 """Merges a text representation of a protocol message into a message.""" 781 self._allow_multiple_scalars = True 782 self._ParseOrMerge(lines, message) 783 return message 784 785 def _ParseOrMerge(self, lines, message): 786 """Converts a text representation of a protocol message into a message. 
787 788 Args: 789 lines: Lines of a message's text representation. 790 message: A protocol buffer message to merge into. 791 792 Raises: 793 ParseError: On text parsing problems. 794 """ 795 # Tokenize expects native str lines. 796 if six.PY2: 797 str_lines = (line if isinstance(line, str) else line.encode('utf-8') 798 for line in lines) 799 else: 800 str_lines = (line if isinstance(line, str) else line.decode('utf-8') 801 for line in lines) 802 tokenizer = Tokenizer(str_lines) 803 while not tokenizer.AtEnd(): 804 self._MergeField(tokenizer, message) 805 806 def _MergeField(self, tokenizer, message): 807 """Merges a single protocol message field into a message. 808 809 Args: 810 tokenizer: A tokenizer to parse the field name and values. 811 message: A protocol message to record the data. 812 813 Raises: 814 ParseError: In case of text parsing problems. 815 """ 816 message_descriptor = message.DESCRIPTOR 817 if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and 818 tokenizer.TryConsume('[')): 819 type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer) 820 tokenizer.Consume(']') 821 tokenizer.TryConsume(':') 822 if tokenizer.TryConsume('<'): 823 expanded_any_end_token = '>' 824 else: 825 tokenizer.Consume('{') 826 expanded_any_end_token = '}' 827 expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name, 828 self.descriptor_pool) 829 if not expanded_any_sub_message: 830 raise ParseError('Type %s not found in descriptor pool' % 831 packed_type_name) 832 while not tokenizer.TryConsume(expanded_any_end_token): 833 if tokenizer.AtEnd(): 834 raise tokenizer.ParseErrorPreviousToken('Expected "%s".' 
% 835 (expanded_any_end_token,)) 836 self._MergeField(tokenizer, expanded_any_sub_message) 837 message.Pack(expanded_any_sub_message, 838 type_url_prefix=type_url_prefix) 839 return 840 841 if tokenizer.TryConsume('['): 842 name = [tokenizer.ConsumeIdentifier()] 843 while tokenizer.TryConsume('.'): 844 name.append(tokenizer.ConsumeIdentifier()) 845 name = '.'.join(name) 846 847 if not message_descriptor.is_extendable: 848 raise tokenizer.ParseErrorPreviousToken( 849 'Message type "%s" does not have extensions.' % 850 message_descriptor.full_name) 851 # pylint: disable=protected-access 852 field = message.Extensions._FindExtensionByName(name) 853 # pylint: enable=protected-access 854 if not field: 855 if self.allow_unknown_extension: 856 field = None 857 else: 858 raise tokenizer.ParseErrorPreviousToken( 859 'Extension "%s" not registered. ' 860 'Did you import the _pb2 module which defines it? ' 861 'If you are trying to place the extension in the MessageSet ' 862 'field of another message that is in an Any or MessageSet field, ' 863 'that message\'s _pb2 module must be imported as well' % name) 864 elif message_descriptor != field.containing_type: 865 raise tokenizer.ParseErrorPreviousToken( 866 'Extension "%s" does not extend message type "%s".' % 867 (name, message_descriptor.full_name)) 868 869 tokenizer.Consume(']') 870 871 else: 872 name = tokenizer.ConsumeIdentifierOrNumber() 873 if self.allow_field_number and name.isdigit(): 874 number = ParseInteger(name, True, True) 875 field = message_descriptor.fields_by_number.get(number, None) 876 if not field and message_descriptor.is_extendable: 877 field = message.Extensions._FindExtensionByNumber(number) 878 else: 879 field = message_descriptor.fields_by_name.get(name, None) 880 881 # Group names are expected to be capitalized as they appear in the 882 # .proto file, which actually matches their type names, not their field 883 # names. 
884 if not field: 885 field = message_descriptor.fields_by_name.get(name.lower(), None) 886 if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP: 887 field = None 888 889 if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and 890 field.message_type.name != name): 891 field = None 892 893 if not field and not self.allow_unknown_field: 894 raise tokenizer.ParseErrorPreviousToken( 895 'Message type "%s" has no field named "%s".' % 896 (message_descriptor.full_name, name)) 897 898 if field: 899 if not self._allow_multiple_scalars and field.containing_oneof: 900 # Check if there's a different field set in this oneof. 901 # Note that we ignore the case if the same field was set before, and we 902 # apply _allow_multiple_scalars to non-scalar fields as well. 903 which_oneof = message.WhichOneof(field.containing_oneof.name) 904 if which_oneof is not None and which_oneof != field.name: 905 raise tokenizer.ParseErrorPreviousToken( 906 'Field "%s" is specified along with field "%s", another member ' 907 'of oneof "%s" for message type "%s".' % 908 (field.name, which_oneof, field.containing_oneof.name, 909 message_descriptor.full_name)) 910 911 if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE: 912 tokenizer.TryConsume(':') 913 merger = self._MergeMessageField 914 else: 915 tokenizer.Consume(':') 916 merger = self._MergeScalarField 917 918 if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and 919 tokenizer.TryConsume('[')): 920 # Short repeated format, e.g. "foo: [1, 2, 3]" 921 if not tokenizer.TryConsume(']'): 922 while True: 923 merger(tokenizer, message, field) 924 if tokenizer.TryConsume(']'): 925 break 926 tokenizer.Consume(',') 927 928 else: 929 merger(tokenizer, message, field) 930 931 else: # Proto field is unknown. 932 assert (self.allow_unknown_extension or self.allow_unknown_field) 933 _SkipFieldContents(tokenizer) 934 935 # For historical reasons, fields may optionally be separated by commas or 936 # semicolons. 
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    return '.'.join(prefix), '.'.join(name)

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # Nested messages may be delimited by angle brackets or braces.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a standalone entry message first and
        # folded into the map container below.
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Mark the field as present even if the nested body is empty.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        # Message-typed map values are merged, not overwritten.
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

  @staticmethod
  def _IsProto3Syntax(message):
    # True iff the message's descriptor declares proto3 syntax.  The hasattr
    # guard keeps this working with older descriptor implementations that
    # lack the 'syntax' attribute.
    message_descriptor = message.DESCRIPTOR
    return (hasattr(message_descriptor, 'syntax') and
            message_descriptor.syntax == 'proto3')

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
1030 """ 1031 _ = self.allow_unknown_extension 1032 value = None 1033 1034 if field.type in (descriptor.FieldDescriptor.TYPE_INT32, 1035 descriptor.FieldDescriptor.TYPE_SINT32, 1036 descriptor.FieldDescriptor.TYPE_SFIXED32): 1037 value = _ConsumeInt32(tokenizer) 1038 elif field.type in (descriptor.FieldDescriptor.TYPE_INT64, 1039 descriptor.FieldDescriptor.TYPE_SINT64, 1040 descriptor.FieldDescriptor.TYPE_SFIXED64): 1041 value = _ConsumeInt64(tokenizer) 1042 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32, 1043 descriptor.FieldDescriptor.TYPE_FIXED32): 1044 value = _ConsumeUint32(tokenizer) 1045 elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64, 1046 descriptor.FieldDescriptor.TYPE_FIXED64): 1047 value = _ConsumeUint64(tokenizer) 1048 elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT, 1049 descriptor.FieldDescriptor.TYPE_DOUBLE): 1050 value = tokenizer.ConsumeFloat() 1051 elif field.type == descriptor.FieldDescriptor.TYPE_BOOL: 1052 value = tokenizer.ConsumeBool() 1053 elif field.type == descriptor.FieldDescriptor.TYPE_STRING: 1054 value = tokenizer.ConsumeString() 1055 elif field.type == descriptor.FieldDescriptor.TYPE_BYTES: 1056 value = tokenizer.ConsumeByteString() 1057 elif field.type == descriptor.FieldDescriptor.TYPE_ENUM: 1058 value = tokenizer.ConsumeEnum(field) 1059 else: 1060 raise RuntimeError('Unknown field type %d' % field.type) 1061 1062 if field.label == descriptor.FieldDescriptor.LABEL_REPEATED: 1063 if field.is_extension: 1064 message.Extensions[field].append(value) 1065 else: 1066 getattr(message, field.name).append(value) 1067 else: 1068 if field.is_extension: 1069 if (not self._allow_multiple_scalars and 1070 not self._IsProto3Syntax(message) and 1071 message.HasExtension(field)): 1072 raise tokenizer.ParseErrorPreviousToken( 1073 'Message type "%s" should not have multiple "%s" extensions.' 
% 1074 (message.DESCRIPTOR.full_name, field.full_name)) 1075 else: 1076 message.Extensions[field] = value 1077 else: 1078 duplicate_error = False 1079 if not self._allow_multiple_scalars: 1080 if self._IsProto3Syntax(message): 1081 # Proto3 doesn't represent presence so we try best effort to check 1082 # multiple scalars by compare to default values. 1083 duplicate_error = bool(getattr(message, field.name)) 1084 else: 1085 duplicate_error = message.HasField(field.name) 1086 1087 if duplicate_error: 1088 raise tokenizer.ParseErrorPreviousToken( 1089 'Message type "%s" should not have multiple "%s" fields.' % 1090 (message.DESCRIPTOR.full_name, field.name)) 1091 else: 1092 setattr(message, field.name, value) 1093 1094 1095def _SkipFieldContents(tokenizer): 1096 """Skips over contents (value or message) of a field. 1097 1098 Args: 1099 tokenizer: A tokenizer to parse the field name and values. 1100 """ 1101 # Try to guess the type of this field. 1102 # If this field is not a message, there should be a ":" between the 1103 # field name and the field value and also the field value should not 1104 # start with "{" or "<" which indicates the beginning of a message body. 1105 # If there is no ":" or there is a "{" or "<" after ":", this field has 1106 # to be a message or the input is ill-formed. 1107 if tokenizer.TryConsume(':') and not tokenizer.LookingAt( 1108 '{') and not tokenizer.LookingAt('<'): 1109 _SkipFieldValue(tokenizer) 1110 else: 1111 _SkipFieldMessage(tokenizer) 1112 1113 1114def _SkipField(tokenizer): 1115 """Skips over a complete field (name and value/message). 1116 1117 Args: 1118 tokenizer: A tokenizer to parse the field name and values. 1119 """ 1120 if tokenizer.TryConsume('['): 1121 # Consume extension name. 
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')
  else:
    tokenizer.ConsumeIdentifierOrNumber()

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')


def _SkipFieldMessage(tokenizer):
  """Skips over a field message.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """

  if tokenizer.TryConsume('<'):
    delimiter = '>'
  else:
    tokenizer.Consume('{')
    delimiter = '}'

  # The loop stops at either closing delimiter; Consume() below then
  # enforces that it matches the opening one.
  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
    _SkipField(tokenizer)

  tokenizer.Consume(delimiter)


def _SkipFieldValue(tokenizer):
  """Skips over a field value.

  Args:
    tokenizer: A tokenizer to parse the field name and values.

  Raises:
    ParseError: In case an invalid field value is found.
  """
  # String/bytes tokens can come in multiple adjacent string literals.
  # If we can consume one, consume as many as we can.
  if tokenizer.TryConsumeByteString():
    while tokenizer.TryConsumeByteString():
      pass
    return

  if (not tokenizer.TryConsumeIdentifier() and
      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
      not tokenizer.TryConsumeFloat()):
    raise ParseError('Invalid field value: ' + tokenizer.token)


class Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
1185 """ 1186 1187 _WHITESPACE = re.compile(r'\s+') 1188 _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE) 1189 _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE) 1190 _TOKEN = re.compile('|'.join([ 1191 r'[a-zA-Z_][0-9a-zA-Z_+-]*', # an identifier 1192 r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*', # a number 1193 ] + [ # quoted str for each quote mark 1194 # Avoid backtracking! https://stackoverflow.com/a/844267 1195 r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark) 1196 for mark in _QUOTES 1197 ])) 1198 1199 _IDENTIFIER = re.compile(r'[^\d\W]\w*') 1200 _IDENTIFIER_OR_NUMBER = re.compile(r'\w+') 1201 1202 def __init__(self, lines, skip_comments=True): 1203 self._position = 0 1204 self._line = -1 1205 self._column = 0 1206 self._token_start = None 1207 self.token = '' 1208 self._lines = iter(lines) 1209 self._current_line = '' 1210 self._previous_line = 0 1211 self._previous_column = 0 1212 self._more_lines = True 1213 self._skip_comments = skip_comments 1214 self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT 1215 or self._WHITESPACE) 1216 self._SkipWhitespace() 1217 self.NextToken() 1218 1219 def LookingAt(self, token): 1220 return self.token == token 1221 1222 def AtEnd(self): 1223 """Checks the end of the text was reached. 1224 1225 Returns: 1226 True iff the end was reached. 
1227 """ 1228 return not self.token 1229 1230 def _PopLine(self): 1231 while len(self._current_line) <= self._column: 1232 try: 1233 self._current_line = next(self._lines) 1234 except StopIteration: 1235 self._current_line = '' 1236 self._more_lines = False 1237 return 1238 else: 1239 self._line += 1 1240 self._column = 0 1241 1242 def _SkipWhitespace(self): 1243 while True: 1244 self._PopLine() 1245 match = self._whitespace_pattern.match(self._current_line, self._column) 1246 if not match: 1247 break 1248 length = len(match.group(0)) 1249 self._column += length 1250 1251 def TryConsume(self, token): 1252 """Tries to consume a given piece of text. 1253 1254 Args: 1255 token: Text to consume. 1256 1257 Returns: 1258 True iff the text was consumed. 1259 """ 1260 if self.token == token: 1261 self.NextToken() 1262 return True 1263 return False 1264 1265 def Consume(self, token): 1266 """Consumes a piece of text. 1267 1268 Args: 1269 token: Text to consume. 1270 1271 Raises: 1272 ParseError: If the text couldn't be consumed. 1273 """ 1274 if not self.TryConsume(token): 1275 raise self.ParseError('Expected "%s".' % token) 1276 1277 def ConsumeComment(self): 1278 result = self.token 1279 if not self._COMMENT.match(result): 1280 raise self.ParseError('Expected comment.') 1281 self.NextToken() 1282 return result 1283 1284 def ConsumeCommentOrTrailingComment(self): 1285 """Consumes a comment, returns a 2-tuple (trailing bool, comment str).""" 1286 1287 # Tokenizer initializes _previous_line and _previous_column to 0. As the 1288 # tokenizer starts, it looks like there is a previous token on the line. 1289 just_started = self._line == 0 and self._column == 0 1290 1291 before_parsing = self._previous_line 1292 comment = self.ConsumeComment() 1293 1294 # A trailing comment is a comment on the same line than the previous token. 
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier or number.

    Returns:
      Identifier or number string.

    Raises:
      ParseError: If an identifier or number couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    try:
      # Note: is_long only affects value type, not whether an error is raised.
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self, is_long=False):
    """Consumes an integer number.

    Args:
      is_long: True if the value should be returned as a long integer.

    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token, is_long=is_long)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed (as text, decoded from UTF-8).

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
1439 """ 1440 the_list = [self._ConsumeSingleByteString()] 1441 while self.token and self.token[0] in _QUOTES: 1442 the_list.append(self._ConsumeSingleByteString()) 1443 return b''.join(the_list) 1444 1445 def _ConsumeSingleByteString(self): 1446 """Consume one token of a string literal. 1447 1448 String literals (whether bytes or text) can come in multiple adjacent 1449 tokens which are automatically concatenated, like in C or Python. This 1450 method only consumes one token. 1451 1452 Returns: 1453 The token parsed. 1454 Raises: 1455 ParseError: When the wrong format data is found. 1456 """ 1457 text = self.token 1458 if len(text) < 1 or text[0] not in _QUOTES: 1459 raise self.ParseError('Expected string but found: %r' % (text,)) 1460 1461 if len(text) < 2 or text[-1] != text[0]: 1462 raise self.ParseError('String missing ending quote: %r' % (text,)) 1463 1464 try: 1465 result = text_encoding.CUnescape(text[1:-1]) 1466 except ValueError as e: 1467 raise self.ParseError(str(e)) 1468 self.NextToken() 1469 return result 1470 1471 def ConsumeEnum(self, field): 1472 try: 1473 result = ParseEnum(field, self.token) 1474 except ValueError as e: 1475 raise self.ParseError(str(e)) 1476 self.NextToken() 1477 return result 1478 1479 def ParseErrorPreviousToken(self, message): 1480 """Creates and *returns* a ParseError for the previously read token. 1481 1482 Args: 1483 message: A message to set for the exception. 1484 1485 Returns: 1486 A ParseError instance. 
1487 """ 1488 return ParseError(message, self._previous_line + 1, 1489 self._previous_column + 1) 1490 1491 def ParseError(self, message): 1492 """Creates and *returns* a ParseError for the current token.""" 1493 return ParseError('\'' + self._current_line + '\': ' + message, 1494 self._line + 1, self._column + 1) 1495 1496 def _StringParseError(self, e): 1497 return self.ParseError('Couldn\'t parse string: ' + str(e)) 1498 1499 def NextToken(self): 1500 """Reads the next meaningful token.""" 1501 self._previous_line = self._line 1502 self._previous_column = self._column 1503 1504 self._column += len(self.token) 1505 self._SkipWhitespace() 1506 1507 if not self._more_lines: 1508 self.token = '' 1509 return 1510 1511 match = self._TOKEN.match(self._current_line, self._column) 1512 if not match and not self._skip_comments: 1513 match = self._COMMENT.match(self._current_line, self._column) 1514 if match: 1515 token = match.group(0) 1516 self.token = token 1517 else: 1518 self.token = self._current_line[self._column] 1519 1520# Aliased so it can still be accessed by current visibility violators. 1521# TODO(dbarnett): Migrate violators to textformat_tokenizer. 1522_Tokenizer = Tokenizer # pylint: disable=invalid-name 1523 1524 1525def _ConsumeInt32(tokenizer): 1526 """Consumes a signed 32bit integer number from tokenizer. 1527 1528 Args: 1529 tokenizer: A tokenizer used to parse the number. 1530 1531 Returns: 1532 The integer parsed. 1533 1534 Raises: 1535 ParseError: If a signed 32bit integer couldn't be consumed. 1536 """ 1537 return _ConsumeInteger(tokenizer, is_signed=True, is_long=False) 1538 1539 1540def _ConsumeUint32(tokenizer): 1541 """Consumes an unsigned 32bit integer number from tokenizer. 1542 1543 Args: 1544 tokenizer: A tokenizer used to parse the number. 1545 1546 Returns: 1547 The integer parsed. 1548 1549 Raises: 1550 ParseError: If an unsigned 32bit integer couldn't be consumed. 
1551 """ 1552 return _ConsumeInteger(tokenizer, is_signed=False, is_long=False) 1553 1554 1555def _TryConsumeInt64(tokenizer): 1556 try: 1557 _ConsumeInt64(tokenizer) 1558 return True 1559 except ParseError: 1560 return False 1561 1562 1563def _ConsumeInt64(tokenizer): 1564 """Consumes a signed 32bit integer number from tokenizer. 1565 1566 Args: 1567 tokenizer: A tokenizer used to parse the number. 1568 1569 Returns: 1570 The integer parsed. 1571 1572 Raises: 1573 ParseError: If a signed 32bit integer couldn't be consumed. 1574 """ 1575 return _ConsumeInteger(tokenizer, is_signed=True, is_long=True) 1576 1577 1578def _TryConsumeUint64(tokenizer): 1579 try: 1580 _ConsumeUint64(tokenizer) 1581 return True 1582 except ParseError: 1583 return False 1584 1585 1586def _ConsumeUint64(tokenizer): 1587 """Consumes an unsigned 64bit integer number from tokenizer. 1588 1589 Args: 1590 tokenizer: A tokenizer used to parse the number. 1591 1592 Returns: 1593 The integer parsed. 1594 1595 Raises: 1596 ParseError: If an unsigned 64bit integer couldn't be consumed. 1597 """ 1598 return _ConsumeInteger(tokenizer, is_signed=False, is_long=True) 1599 1600 1601def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False): 1602 try: 1603 _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long) 1604 return True 1605 except ParseError: 1606 return False 1607 1608 1609def _ConsumeInteger(tokenizer, is_signed=False, is_long=False): 1610 """Consumes an integer number from tokenizer. 1611 1612 Args: 1613 tokenizer: A tokenizer used to parse the number. 1614 is_signed: True if a signed integer must be parsed. 1615 is_long: True if a long integer must be parsed. 1616 1617 Returns: 1618 The integer parsed. 1619 1620 Raises: 1621 ParseError: If an integer with given characteristics couldn't be consumed. 
1622 """ 1623 try: 1624 result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long) 1625 except ValueError as e: 1626 raise tokenizer.ParseError(str(e)) 1627 tokenizer.NextToken() 1628 return result 1629 1630 1631def ParseInteger(text, is_signed=False, is_long=False): 1632 """Parses an integer. 1633 1634 Args: 1635 text: The text to parse. 1636 is_signed: True if a signed integer must be parsed. 1637 is_long: True if a long integer must be parsed. 1638 1639 Returns: 1640 The integer value. 1641 1642 Raises: 1643 ValueError: Thrown Iff the text is not a valid integer. 1644 """ 1645 # Do the actual parsing. Exception handling is propagated to caller. 1646 result = _ParseAbstractInteger(text, is_long=is_long) 1647 1648 # Check if the integer is sane. Exceptions handled by callers. 1649 checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)] 1650 checker.CheckValue(result) 1651 return result 1652 1653 1654def _ParseAbstractInteger(text, is_long=False): 1655 """Parses an integer without checking size/signedness. 1656 1657 Args: 1658 text: The text to parse. 1659 is_long: True if the value should be returned as a long integer. 1660 1661 Returns: 1662 The integer value. 1663 1664 Raises: 1665 ValueError: Thrown Iff the text is not a valid integer. 1666 """ 1667 # Do the actual parsing. Exception handling is propagated to caller. 1668 orig_text = text 1669 c_octal_match = re.match(r'(-?)0(\d+)$', text) 1670 if c_octal_match: 1671 # Python 3 no longer supports 0755 octal syntax without the 'o', so 1672 # we always use the '0o' prefix for multi-digit numbers starting with 0. 1673 text = c_octal_match.group(1) + '0o' + c_octal_match.group(2) 1674 try: 1675 # We force 32-bit values to int and 64-bit values to long to make 1676 # alternate implementations where the distinction is more significant 1677 # (e.g. the C++ implementation) simpler. 
    if is_long:
      # 'long' is aliased to 'int' on Python 3 (see module top).
      return long(text, 0)
    else:
      return int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % orig_text)


def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)


def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean value parsed.

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')


def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Numeric value.
    if hasattr(field.file, 'syntax'):
      # Attribute is checked for compatibility.
      if field.file.syntax == 'proto3':
        # Proto3 accept numeric unknown enums.
        return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number