• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example::
34
35  # Create a proto object and serialize it to a text proto string.
36  message = my_proto_pb2.MyMessage(foo='bar')
37  text_proto = text_format.MessageToString(message)
38
39  # Parse a text proto string.
40  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45# TODO(b/129989314) Import thread contention leads to test failures.
46import encodings.raw_unicode_escape  # pylint: disable=unused-import
47import encodings.unicode_escape  # pylint: disable=unused-import
48import io
49import re
50
51import six
52
53if six.PY3:
54  long = int  # pylint: disable=redefined-builtin,invalid-name
55
56# pylint: disable=g-import-not-at-top
57from google.protobuf.internal import decoder
58from google.protobuf.internal import type_checkers
59from google.protobuf import descriptor
60from google.protobuf import text_encoding
61
# Public API of this module.
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# Value checkers used to validate/coerce integer literals for each of the
# four integer wire types.
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches "inf"/"infinity" with optional '-' sign and optional trailing 'f'.
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
# Matches "nan" with an optional trailing 'f'.
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
# Characters accepted as string-literal delimiters in text format.
_QUOTES = frozenset(("'", '"'))
# Full type name of google.protobuf.Any, which gets special-case printing.
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
73
74
class Error(Exception):
  """Top-level module error for text_format.

  Base class for all exceptions raised by this module.
  """
77
78
class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error."""

  def __init__(self, message=None, line=None, column=None):
    """Creates a ParseError.

    Args:
      message: Error description, or None.
      line: Line number where the error occurred, or None.
      column: Column where the error occurred, or None.
    """
    # When both a message and a line are known, prefix the message with the
    # "line[:column] : " location.
    if message is not None and line is not None:
      location = str(line)
      if column is not None:
        location = '%s:%s' % (location, column)
      message = '%s : %s' % (location, message)
    if message is None:
      super(ParseError, self).__init__()
    else:
      super(ParseError, self).__init__(message)
    self._line = line
    self._column = column

  def GetLine(self):
    """Returns the line number given at construction, or None."""
    return self._line

  def GetColumn(self):
    """Returns the column given at construction, or None."""
    return self._column
100
101
class TextWriter(object):
  """In-memory writer accumulating text output for MessageToString.

  Buffers UTF-8 bytes on Python 2 and unicode text on Python 3.
  """

  def __init__(self, as_utf8):
    # NOTE: as_utf8 is accepted for interface compatibility but is not
    # consulted here; the buffer type depends only on the Python version.
    self._writer = io.BytesIO() if six.PY2 else io.StringIO()

  def write(self, val):
    # On Python 2 the buffer holds bytes, so encode unicode input first.
    if six.PY2 and isinstance(val, six.text_type):
      val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()
121
122
def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Convert protobuf message to text format.

  Double values can be formatted compactly with 15 digits of
  precision (which is the most that IEEE 754 "double" can guarantee)
  using double_format='.15g'. To ensure that converting to text and back to a
  proto will result in an identical value, double_format='.17g' should be used.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than only ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest float
      that has same value in wire will be printed. Also affect double field
      if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  _Printer(
      out,
      indent,
      as_utf8,
      as_one_line,
      use_short_repeated_primitives,
      pointy_brackets,
      use_index_order,
      float_format,
      double_format,
      use_field_number,
      descriptor_pool,
      message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon).PrintMessage(message)
  text = out.getvalue()
  out.close()
  # One-line mode appends a separator after every field; drop the last one.
  return text.rstrip() if as_one_line else text
203
204
def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Convert protobuf message to encoded text format.  See MessageToString."""
  serialized = MessageToString(message, **kwargs)
  if not isinstance(serialized, bytes):
    # ASCII unless the caller explicitly asked for unescaped UTF-8 output.
    encoding = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
    serialized = serialized.encode(encoding)
  return serialized
213
214
def _IsMapEntry(field):
  """Returns True if field is the auto-generated entry message of a map."""
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry
219
220
def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Convert the message to text format and write it to the out stream.

  Args:
    message: The protocol buffers message.
    out: A file handle to write the message to.
    (remaining options as documented on MessageToString)
  """
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     use_field_number, descriptor_pool, message_formatter,
                     print_unknown_fields, force_colon)
  printer.PrintMessage(message)
250
251
def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Print a single field name/value pair."""
  printer = _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintField(field, value)
274
275
def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Print a single field value (not including name)."""
  printer = _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintFieldValue(field, value)
298
299
def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a new protobuf message instance for the given type name.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance.

  Returns:
    A Message instance of type matching type_name, or None if no Descriptor
    matching type_name was found.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_module
    descriptor_pool = pool_module.Default()
  from google.protobuf import symbol_database
  database = symbol_database.Default()
  try:
    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  return database.GetPrototype(message_descriptor)()
323
324
# These values must match WireType enum in google/protobuf/wire_format.h.
# They are compared against UnknownField.wire_type in _PrintUnknownFields.
WIRETYPE_LENGTH_DELIMITED = 2
WIRETYPE_START_GROUP = 3
328
329
class _Printer(object):
  """Text format printer for protocol message."""

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
          In Python 3 actual Unicode characters may appear as is in strings.
          In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, shortest
        float that has same value in wire will be printed. Also affect double
        field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Double fields fall back to float_format when double_format is unset.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field."""
    if '/' not in message.type_url:
      return False
    # Resolve the packed type; if the pool doesn't know it, fall back to
    # printing the Any's raw fields.
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      return False

  def _TryCustomFormatMessage(self, message):
    """Delegates to self.message_formatter; returns True if it handled it."""
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        # Map entries are printed as one key/value submessage per key, in
        # sorted key order for deterministic output.
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(message.UnknownFields())

  def _PrintUnknownFields(self, unknown_fields):
    """Print unknown fields."""
    out = self.out
    for field in unknown_fields:
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:    # pylint: disable=broad-except
          pos = 0

        # Only treat it as an embedded message when the decode consumed the
        # entire payload; otherwise print it as an escaped string/bytes value.
        if pos == len(field.data):
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value as `name: [v1, v2, ...]`."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in six.moves.range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    if self.force_colon:
      self.out.write(':')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    """Prints a message value wrapped in braces (or pointy brackets)."""
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      # Prefer the symbolic enum name; fall back to the raw number for
      # values unknown to the enum descriptor.
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        # No explicit format: print the shortest string that round-trips.
        out.write(str(type_checkers.ToShortestFloat(value)))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))
639
640
def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: for historical reasons this function does not clear the input
  message. This is different from what the binary msg.ParseFrom(...) does.
  If text contains a field already set in message, the value is appended if the
  field is repeated. Otherwise, an error is raised.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Choose a newline of the same type as the input so both bytes and
  # unicode text split cleanly.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return ParseLines(
      text.split(newline),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)
697
698
def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but allows repeated values for a non-repeated field, and uses
  the last one. This means any non-repeated, top-level fields specified in text
  replace those in the message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Choose a newline of the same type as the input so both bytes and
  # unicode text split cleanly.
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return MergeLines(
      text.split(newline),
      message,
      allow_unknown_extension,
      allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field)
735
736
def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Parse() for caveats.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).ParseLines(lines, message)
769
770
def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  See Merge() for more details.

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  return _Parser(
      allow_unknown_extension=allow_unknown_extension,
      allow_field_number=allow_field_number,
      descriptor_pool=descriptor_pool,
      allow_unknown_field=allow_unknown_field).MergeLines(lines, message)
803
804
class _Parser(object):
  """Text format parser for protocol message."""

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    """Initializes the parser.

    Args:
      allow_unknown_extension: if True, skip over missing extensions and keep
        parsing.
      allow_field_number: if True, fields may be identified by number as well
        as by name.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      allow_unknown_field: if True, skip over unknown fields and keep parsing.
    """
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

  def ParseLines(self, lines, message):
    """Parses a text representation of a protocol message into a message."""
    # Parse (as opposed to Merge) raises on repeated occurrences of a
    # singular field; see _MergeScalarField/_MergeMessageField.
    self._allow_multiple_scalars = False
    self._ParseOrMerge(lines, message)
    return message

  def MergeLines(self, lines, message):
    """Merges a text representation of a protocol message into a message."""
    # Merge allows a singular field to appear more than once.
    self._allow_multiple_scalars = True
    self._ParseOrMerge(lines, message)
    return message

  def _ParseOrMerge(self, lines, message):
    """Converts a text representation of a protocol message into a message.

    Args:
      lines: Lines of a message's text representation.
      message: A protocol buffer message to merge into.

    Raises:
      ParseError: On text parsing problems.
    """
    # Tokenize expects native str lines.
    if six.PY2:
      str_lines = (line if isinstance(line, str) else line.encode('utf-8')
                   for line in lines)
    else:
      str_lines = (line if isinstance(line, str) else line.decode('utf-8')
                   for line in lines)
    tokenizer = Tokenizer(str_lines)
    # Each iteration consumes one top-level field.
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: [type_url_prefix/full.type.name] { ... }.  The
    # packed sub-message is parsed into a fresh instance of its type and then
    # re-packed into this Any message.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix)
      return

    # An extension field is written as a bracketed dotted name, e.g.
    # [my.package.ext_field].
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        # Numeric field reference; extensions are searched too when the
        # message type is extendable.
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # The ':' is optional before a nested message body, required before a
      # scalar value.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name."""
    # Consume "type.googleapis.com/".
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    return '.'.join(prefix), '.'.join(name)

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # A message body is delimited by either '{...}' or '<...>'.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        # Map entries are parsed into a standalone entry message and folded
        # into the map container below.
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        # Message-valued entries merge into any existing value for the key.
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

  @staticmethod
  def _IsProto3Syntax(message):
    """Returns True iff the message's descriptor declares proto3 syntax."""
    message_descriptor = message.DESCRIPTOR
    return (hasattr(message_descriptor, 'syntax') and
            message_descriptor.syntax == 'proto3')

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
    """
    # Deliberately unused here; scalar merging never skips extensions.
    _ = self.allow_unknown_extension
    value = None

    # Dispatch on the declared field type to the matching token consumer.
    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                      descriptor.FieldDescriptor.TYPE_SINT32,
                      descriptor.FieldDescriptor.TYPE_SFIXED32):
      value = _ConsumeInt32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                        descriptor.FieldDescriptor.TYPE_SINT64,
                        descriptor.FieldDescriptor.TYPE_SFIXED64):
      value = _ConsumeInt64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                        descriptor.FieldDescriptor.TYPE_FIXED32):
      value = _ConsumeUint32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                        descriptor.FieldDescriptor.TYPE_FIXED64):
      value = _ConsumeUint64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                        descriptor.FieldDescriptor.TYPE_DOUBLE):
      value = tokenizer.ConsumeFloat()
    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
      value = tokenizer.ConsumeBool()
    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
      value = tokenizer.ConsumeString()
    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      value = tokenizer.ConsumeByteString()
    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
      value = tokenizer.ConsumeEnum(field)
    else:
      raise RuntimeError('Unknown field type %d' % field.type)

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        message.Extensions[field].append(value)
      else:
        getattr(message, field.name).append(value)
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            not self._IsProto3Syntax(message) and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        else:
          message.Extensions[field] = value
      else:
        duplicate_error = False
        if not self._allow_multiple_scalars:
          if self._IsProto3Syntax(message):
            # Proto3 doesn't represent presence so we try best effort to check
            # multiple scalars by compare to default values.
            duplicate_error = bool(getattr(message, field.name))
          else:
            duplicate_error = message.HasField(field.name)

        if duplicate_error:
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        else:
          setattr(message, field.name, value)
1138
1139
def _SkipFieldContents(tokenizer):
  """Skips the value or message body of a single field.

  A scalar field is written as ``name: value`` while a message body follows
  an optional ``:`` with ``{`` or ``<``.  So: if a ``:`` is consumed and the
  next token does not open a message body, the contents are a scalar value;
  otherwise they must be a message body (or the input is ill-formed).

  Args:
    tokenizer: A tokenizer positioned just after the field name.
  """
  has_colon = tokenizer.TryConsume(':')
  opens_message = tokenizer.LookingAt('{') or tokenizer.LookingAt('<')
  if has_colon and not opens_message:
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)
1157
1158
def _SkipField(tokenizer):
  """Skips one whole field: its name plus its value or message body.

  Args:
    tokenizer: A tokenizer positioned at the start of a field.
  """
  if tokenizer.TryConsume('['):
    # Bracketed extension name: a dotted identifier, e.g. [my.pkg.ext].
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')
  else:
    tokenizer.ConsumeIdentifierOrNumber()

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons; consume at most one separator.
  tokenizer.TryConsume(',') or tokenizer.TryConsume(';')
1180
1181
def _SkipFieldMessage(tokenizer):
  """Skips a brace- or angle-delimited message body.

  Args:
    tokenizer: A tokenizer positioned at the opening '{' or '<'.
  """
  if tokenizer.TryConsume('<'):
    closing = '>'
  else:
    tokenizer.Consume('{')
    closing = '}'

  # Skip nested fields until either kind of closing token shows up, then
  # require the one that matches the opener.
  while not (tokenizer.LookingAt('>') or tokenizer.LookingAt('}')):
    _SkipField(tokenizer)

  tokenizer.Consume(closing)
1199
1200
def _SkipFieldValue(tokenizer):
  """Skips a scalar field value.

  Args:
    tokenizer: A tokenizer positioned at the value.

  Raises:
    ParseError: In case an invalid field value is found.
  """
  # Adjacent string literals are implicitly concatenated, so once one is
  # consumed, keep consuming until no more follow.
  if tokenizer.TryConsumeByteString():
    while tokenizer.TryConsumeByteString():
      pass
    return

  # Otherwise the value must be an identifier (bool/enum), an integer, or a
  # float; try each in turn.
  if tokenizer.TryConsumeIdentifier():
    return
  if _TryConsumeInt64(tokenizer):
    return
  if _TryConsumeUint64(tokenizer):
    return
  if tokenizer.TryConsumeFloat():
    return
  raise ParseError('Invalid field value: ' + tokenizer.token)
1221
1222
class Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  _WHITESPACE = re.compile(r'\s+')
  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
  # A token is an identifier, a number, or a quoted string (one alternative
  # per quote mark).
  _TOKEN = re.compile('|'.join([
      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
  ] + [  # quoted str for each quote mark
      # Avoid backtracking! https://stackoverflow.com/a/844267
      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
      for mark in _QUOTES
  ]))

  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')

  def __init__(self, lines, skip_comments=True):
    self._position = 0
    self._line = -1  # 0-based index of the current line; -1 before any line.
    self._column = 0  # 0-based column where the next token starts.
    self._token_start = None
    self.token = ''  # The current token; '' signals end of input.
    self._lines = iter(lines)
    self._current_line = ''
    self._previous_line = 0
    self._previous_column = 0
    self._more_lines = True
    self._skip_comments = skip_comments
    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
                                or self._WHITESPACE)
    # Position on the first token immediately.
    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals the given text (no consume)."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    """Advances to the next input line while the current one is exhausted."""
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    """Skips whitespace (and comments, when enabled) before the next token."""
    while True:
      self._PopLine()
      match = self._whitespace_pattern.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self.ParseError('Expected "%s".' % token)

  def ConsumeComment(self):
    """Consumes a comment token and returns its text.

    Raises:
      ParseError: If the current token is not a comment.
    """
    result = self.token
    if not self._COMMENT.match(result):
      raise self.ParseError('Expected comment.')
    self.NextToken()
    return result

  def ConsumeCommentOrTrailingComment(self):
    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""

    # Tokenizer initializes _previous_line and _previous_column to 0. As the
    # tokenizer starts, it looks like there is a previous token on the line.
    just_started = self._line == 0 and self._column == 0

    before_parsing = self._previous_line
    comment = self.ConsumeComment()

    # A trailing comment is a comment on the same line than the previous token.
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    try:
      # Note: is_long only affects value type, not whether an error is raised.
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self, is_long=False):
    """Consumes an integer number.

    Args:
      is_long: True if the value should be returned as a long integer.
    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token, is_long=is_long)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, like in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self.ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self.ParseError('String missing ending quote: %r' % (text,))

    try:
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes the current token as an enum value of the given field.

    Raises:
      ParseError: If the token is not a valid value for the enum.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError(message, self._previous_line + 1,
                      self._previous_column + 1)

  def ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('\'' + self._current_line + '\': ' + message,
                      self._line + 1, self._column + 1)

  def _StringParseError(self, e):
    return self.ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if not match and not self._skip_comments:
      match = self._COMMENT.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # No token pattern matched; fall back to a single-character token.
      self.token = self._current_line[self._column]
1564
# Aliased to the underscore-prefixed name so it can still be accessed by
# current visibility violators.
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name
1568
1569
def _ConsumeInt32(tokenizer):
  """Reads a signed 32bit integer token from the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The parsed integer.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=True)
1583
1584
def _ConsumeUint32(tokenizer):
  """Reads an unsigned 32bit integer token from the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The parsed integer.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=False)
1598
1599
def _TryConsumeInt64(tokenizer):
  """Returns True and consumes the token iff it is a signed 64bit integer."""
  try:
    _ConsumeInt64(tokenizer)
  except ParseError:
    return False
  return True
1606
1607
def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1621
1622
def _TryConsumeUint64(tokenizer):
  """Returns True and consumes the token iff it is an unsigned 64bit integer."""
  try:
    _ConsumeUint64(tokenizer)
  except ParseError:
    return False
  return True
1629
1630
def _ConsumeUint64(tokenizer):
  """Reads an unsigned 64bit integer token from the tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The parsed integer.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_long=True, is_signed=False)
1644
1645
def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Returns True and consumes the token iff it is a matching integer."""
  try:
    _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long)
  except ParseError:
    return False
  return True
1652
1653
def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Reads an integer token with the given signedness and width.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The parsed integer.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  token = tokenizer.token
  try:
    value = ParseInteger(token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    # Re-raise as a ParseError carrying the tokenizer's position.
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return value
1674
1675
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer and range-checks it.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Parse first; a ValueError propagates to the caller.
  result = _ParseAbstractInteger(text, is_long=is_long)

  # Select the checker matching (width, signedness) and validate the range.
  checker_index = 2 * int(is_long) + int(is_signed)
  _INTEGER_CHECKERS[checker_index].CheckValue(result)
  return result
1697
1698
1699def _ParseAbstractInteger(text, is_long=False):
1700  """Parses an integer without checking size/signedness.
1701
1702  Args:
1703    text: The text to parse.
1704    is_long: True if the value should be returned as a long integer.
1705
1706  Returns:
1707    The integer value.
1708
1709  Raises:
1710    ValueError: Thrown Iff the text is not a valid integer.
1711  """
1712  # Do the actual parsing. Exception handling is propagated to caller.
1713  orig_text = text
1714  c_octal_match = re.match(r'(-?)0(\d+)$', text)
1715  if c_octal_match:
1716    # Python 3 no longer supports 0755 octal syntax without the 'o', so
1717    # we always use the '0o' prefix for multi-digit numbers starting with 0.
1718    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1719  try:
1720    # We force 32-bit values to int and 64-bit values to long to make
1721    # alternate implementations where the distinction is more significant
1722    # (e.g. the C++ implementation) simpler.
1723    if is_long:
1724      return long(text, 0)
1725    else:
1726      return int(text, 0)
1727  except ValueError:
1728    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1729
1730
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Fast path: Python already understands most spellings.
    return float(text)
  except ValueError:
    pass
  # Check alternative spellings used by the proto text format.
  if _FLOAT_INFINITY.match(text):
    return float('-inf') if text[0] == '-' else float('inf')
  if _FLOAT_NAN.match(text):
    return float('nan')
  # Assume a C-style '1.0f' suffix and retry without it.
  try:
    return float(text.rstrip('f'))
  except ValueError:
    raise ValueError('Couldn\'t parse float: %s' % text)
1761
1762
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  # The text format accepts several spellings for each boolean value.
  truthy = ('true', 't', '1', 'True')
  falsy = ('false', 'f', '0', 'False')
  if text in truthy:
    return True
  if text in falsy:
    return False
  raise ValueError('Expected "true" or "false".')
1781
1782
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Not numeric: treat the value as an enum name (identifier).
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
    return enum_value.number
  # Numeric value. The 'syntax' attribute is checked for compatibility with
  # descriptors that predate it.
  if hasattr(field.file, 'syntax') and field.file.syntax == 'proto3':
    # Proto3 accepts numeric unknown enums.
    return number
  enum_value = enum_descriptor.values_by_number.get(number, None)
  if enum_value is None:
    raise ValueError('Enum type "%s" has no value with number %d.' %
                     (enum_descriptor.full_name, number))
  return enum_value.number
1820