1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example::
34
35  # Create a proto object and serialize it to a text proto string.
36  message = my_proto_pb2.MyMessage(foo='bar')
37  text_proto = text_format.MessageToString(message)
38
39  # Parse a text proto string.
40  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45# TODO(b/129989314) Import thread contention leads to test failures.
46import encodings.raw_unicode_escape  # pylint: disable=unused-import
47import encodings.unicode_escape  # pylint: disable=unused-import
48import io
49import math
50import re
51import six
52
53from google.protobuf.internal import decoder
54from google.protobuf.internal import type_checkers
55from google.protobuf import descriptor
56from google.protobuf import text_encoding
57
if six.PY3:
  # Provide PY2's ``long`` name on PY3 so numeric code below is version-neutral.
  long = int  # pylint: disable=redefined-builtin,invalid-name

# pylint: disable=g-import-not-at-top
__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
           'PrintFieldValue', 'Merge', 'MessageToBytes']

# Signed/unsigned 32/64-bit integer range checkers (used by the parser).
_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
                     type_checkers.Int32ValueChecker(),
                     type_checkers.Uint64ValueChecker(),
                     type_checkers.Int64ValueChecker())
# Matches 'inf', '-infinity', 'inff', ... (case-insensitive, optional 'f' suffix).
_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
# Matches 'nan' / 'nanf' (case-insensitive).
_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
# Characters that may open or close a quoted string literal.
_QUOTES = frozenset(("'", '"'))
# Full name of the well-known Any type; such messages get expanded printing.
_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
73
74
# Root of this module's exception hierarchy (ParseError derives from it).
class Error(Exception):
  """Top-level module error for text_format."""
77
78
class ParseError(Error):
  """Thrown in case of text parsing or tokenizing error.

  When location information is available, the message is prefixed with
  'line' or 'line:column' so the failing position is easy to find.
  """

  def __init__(self, message=None, line=None, column=None):
    self._line = line
    self._column = column
    if message is not None and line is not None:
      # Prefix the message with its source location: 'line : msg' or
      # 'line:column : msg'.
      loc = str(line)
      if column is not None:
        loc += ':{0}'.format(column)
      message = '{0} : {1}'.format(loc, message)
    if message is None:
      super(ParseError, self).__init__()
    else:
      super(ParseError, self).__init__(message)

  def GetLine(self):
    """Returns the 1-based line of the error, or None if unknown."""
    return self._line

  def GetColumn(self):
    """Returns the column of the error, or None if unknown."""
    return self._column
100
101
class TextWriter(object):
  """In-memory text sink used by MessageToString.

  Accumulates UTF-8 bytes on Python 2 and text on Python 3.
  """

  def __init__(self, as_utf8):
    # as_utf8 is unused here; the buffer type depends only on the Python
    # major version.
    self._writer = io.BytesIO() if six.PY2 else io.StringIO()

  def write(self, val):
    # On PY2 the buffer holds bytes, so encode any unicode input first.
    if six.PY2 and isinstance(val, six.text_type):
      val = val.encode('utf-8')
    return self._writer.write(val)

  def close(self):
    return self._writer.close()

  def getvalue(self):
    return self._writer.getvalue()
121
122
def MessageToString(
    message,
    as_utf8=False,
    as_one_line=False,
    use_short_repeated_primitives=False,
    pointy_brackets=False,
    use_index_order=False,
    float_format=None,
    double_format=None,
    use_field_number=False,
    descriptor_pool=None,
    indent=0,
    message_formatter=None,
    print_unknown_fields=False,
    force_colon=False):
  # type: (...) -> str
  """Converts a protobuf message to its text-format representation.

  Double values can be rendered compactly with 15 digits of precision
  (the most IEEE 754 "double" guarantees) via double_format='.15g'; use
  double_format='.17g' when a text round-trip must reproduce the exact value.

  Args:
    message: The protocol buffers message.
    as_utf8: Return unescaped Unicode for non-ASCII characters.
        In Python 3 actual Unicode characters may appear as is in strings.
        In Python 2 the return value will be valid UTF-8 rather than only ASCII.
    as_one_line: Don't introduce newlines between fields.
    use_short_repeated_primitives: Use short repeated format for primitives.
    pointy_brackets: If True, use angle brackets instead of curly braces for
      nesting.
    use_index_order: If True, fields of a proto message will be printed using
      the order defined in source code instead of the field number, extensions
      will be printed at the end of the message and their relative order is
      determined by the extension number. By default, use the field number
      order.
    float_format (str): If set, use this to specify float field formatting
      (per the "Format Specification Mini-Language"); otherwise, shortest float
      that has same value in wire will be printed. Also affect double field
      if double_format is not set but float_format is set.
    double_format (str): If set, use this to specify double field formatting
      (per the "Format Specification Mini-Language"); if it is not set but
      float_format is set, use float_format. Otherwise, use ``str()``
    use_field_number: If True, print field numbers instead of names.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    indent (int): The initial indent level, in terms of spaces, for pretty
      print.
    message_formatter (function(message, indent, as_one_line) -> unicode|None):
      Custom formatter for selected sub-messages (usually based on message
      type). Use to pretty print parts of the protobuf for easier diffing.
    print_unknown_fields: If True, unknown fields will be printed.
    force_colon: If set, a colon will be added after the field name even if the
      field is a proto message.

  Returns:
    str: A string of the text formatted protocol buffer message.
  """
  out = TextWriter(as_utf8)
  printer = _Printer(
      out=out,
      indent=indent,
      as_utf8=as_utf8,
      as_one_line=as_one_line,
      use_short_repeated_primitives=use_short_repeated_primitives,
      pointy_brackets=pointy_brackets,
      use_index_order=use_index_order,
      float_format=float_format,
      double_format=double_format,
      use_field_number=use_field_number,
      descriptor_pool=descriptor_pool,
      message_formatter=message_formatter,
      print_unknown_fields=print_unknown_fields,
      force_colon=force_colon)
  printer.PrintMessage(message)
  result = out.getvalue()
  out.close()
  # One-line output ends with a trailing separator space; strip it.
  return result.rstrip() if as_one_line else result
203
204
def MessageToBytes(message, **kwargs):
  # type: (...) -> bytes
  """Converts a protobuf message to encoded text format; see MessageToString."""
  text = MessageToString(message, **kwargs)
  if isinstance(text, bytes):
    # Python 2 with as_utf8 already yields bytes.
    return text
  return text.encode('utf-8' if kwargs.get('as_utf8') else 'ascii')
213
214
def _IsMapEntry(field):
  """Returns True if field is a synthesized map-entry message field."""
  if field.type != descriptor.FieldDescriptor.TYPE_MESSAGE:
    return False
  entry_type = field.message_type
  return entry_type.has_options and entry_type.GetOptions().map_entry
219
220
def PrintMessage(message,
                 out,
                 indent=0,
                 as_utf8=False,
                 as_one_line=False,
                 use_short_repeated_primitives=False,
                 pointy_brackets=False,
                 use_index_order=False,
                 float_format=None,
                 double_format=None,
                 use_field_number=False,
                 descriptor_pool=None,
                 message_formatter=None,
                 print_unknown_fields=False,
                 force_colon=False):
  """Writes the text-format representation of message to out.

  The options mirror MessageToString; see that function for details.
  """
  printer = _Printer(out, indent, as_utf8, as_one_line,
                     use_short_repeated_primitives, pointy_brackets,
                     use_index_order, float_format, double_format,
                     use_field_number, descriptor_pool, message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintMessage(message)
250
251
def PrintField(field,
               value,
               out,
               indent=0,
               as_utf8=False,
               as_one_line=False,
               use_short_repeated_primitives=False,
               pointy_brackets=False,
               use_index_order=False,
               float_format=None,
               double_format=None,
               message_formatter=None,
               print_unknown_fields=False,
               force_colon=False):
  """Prints a single field name/value pair to out."""
  printer = _Printer(out=out, indent=indent, as_utf8=as_utf8,
                     as_one_line=as_one_line,
                     use_short_repeated_primitives=use_short_repeated_primitives,
                     pointy_brackets=pointy_brackets,
                     use_index_order=use_index_order,
                     float_format=float_format,
                     double_format=double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintField(field, value)
274
275
def PrintFieldValue(field,
                    value,
                    out,
                    indent=0,
                    as_utf8=False,
                    as_one_line=False,
                    use_short_repeated_primitives=False,
                    pointy_brackets=False,
                    use_index_order=False,
                    float_format=None,
                    double_format=None,
                    message_formatter=None,
                    print_unknown_fields=False,
                    force_colon=False):
  """Prints a single field value (not including the field name) to out."""
  printer = _Printer(out=out, indent=indent, as_utf8=as_utf8,
                     as_one_line=as_one_line,
                     use_short_repeated_primitives=use_short_repeated_primitives,
                     pointy_brackets=pointy_brackets,
                     use_index_order=use_index_order,
                     float_format=float_format,
                     double_format=double_format,
                     message_formatter=message_formatter,
                     print_unknown_fields=print_unknown_fields,
                     force_colon=force_colon)
  printer.PrintFieldValue(field, value)
298
299
def _BuildMessageFromTypeName(type_name, descriptor_pool):
  """Returns a fresh protobuf message instance for a type name.

  Args:
    type_name: Fully-qualified protobuf message type name string.
    descriptor_pool: DescriptorPool instance, or None for the default pool.

  Returns:
    A Message instance of type matching type_name, or None if no Descriptor
    matching type_name was found.
  """
  # pylint: disable=g-import-not-at-top
  if descriptor_pool is None:
    from google.protobuf import descriptor_pool as pool_mod
    descriptor_pool = pool_mod.Default()
  from google.protobuf import symbol_database
  try:
    msg_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
  except KeyError:
    return None
  return symbol_database.Default().GetPrototype(msg_descriptor)()
323
324
# These values must match WireType enum in google/protobuf/wire_format.h.
WIRETYPE_LENGTH_DELIMITED = 2  # strings, bytes, embedded messages
WIRETYPE_START_GROUP = 3  # deprecated group start marker
328
329
class _Printer(object):
  """Text format printer for protocol message.

  Holds the formatting options and the mutable state (current indent level)
  used while recursively printing a message tree to ``self.out``.
  """

  def __init__(
      self,
      out,
      indent=0,
      as_utf8=False,
      as_one_line=False,
      use_short_repeated_primitives=False,
      pointy_brackets=False,
      use_index_order=False,
      float_format=None,
      double_format=None,
      use_field_number=False,
      descriptor_pool=None,
      message_formatter=None,
      print_unknown_fields=False,
      force_colon=False):
    """Initialize the Printer.

    Double values can be formatted compactly with 15 digits of precision
    (which is the most that IEEE 754 "double" can guarantee) using
    double_format='.15g'. To ensure that converting to text and back to a proto
    will result in an identical value, double_format='.17g' should be used.

    Args:
      out: To record the text format result.
      indent: The initial indent level for pretty print.
      as_utf8: Return unescaped Unicode for non-ASCII characters.
          In Python 3 actual Unicode characters may appear as is in strings.
          In Python 2 the return value will be valid UTF-8 rather than ASCII.
      as_one_line: Don't introduce newlines between fields.
      use_short_repeated_primitives: Use short repeated format for primitives.
      pointy_brackets: If True, use angle brackets instead of curly braces for
        nesting.
      use_index_order: If True, print fields of a proto message using the order
        defined in source code instead of the field number. By default, use the
        field number order.
      float_format: If set, use this to specify float field formatting
        (per the "Format Specification Mini-Language"); otherwise, shortest
        float that has same value in wire will be printed. Also affect double
        field if double_format is not set but float_format is set.
      double_format: If set, use this to specify double field formatting
        (per the "Format Specification Mini-Language"); if it is not set but
        float_format is set, use float_format. Otherwise, str() is used.
      use_field_number: If True, print field numbers instead of names.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      message_formatter: A function(message, indent, as_one_line): unicode|None
        to custom format selected sub-messages (usually based on message type).
        Use to pretty print parts of the protobuf for easier diffing.
      print_unknown_fields: If True, unknown fields will be printed.
      force_colon: If set, a colon will be added after the field name even if
        the field is a proto message.
    """
    self.out = out
    self.indent = indent
    self.as_utf8 = as_utf8
    self.as_one_line = as_one_line
    self.use_short_repeated_primitives = use_short_repeated_primitives
    self.pointy_brackets = pointy_brackets
    self.use_index_order = use_index_order
    self.float_format = float_format
    # Doubles fall back to float_format when double_format is not given.
    if double_format is not None:
      self.double_format = double_format
    else:
      self.double_format = float_format
    self.use_field_number = use_field_number
    self.descriptor_pool = descriptor_pool
    self.message_formatter = message_formatter
    self.print_unknown_fields = print_unknown_fields
    self.force_colon = force_colon

  def _TryPrintAsAnyMessage(self, message):
    """Serializes if message is a google.protobuf.Any field.

    Returns:
      True if the Any was printed in expanded ``[type_url] { ... }`` form,
      False if the packed type could not be resolved (caller prints the raw
      fields instead).
    """
    # A type_url without a '/' separator cannot carry a resolvable type name.
    if '/' not in message.type_url:
      return False
    packed_message = _BuildMessageFromTypeName(message.TypeName(),
                                               self.descriptor_pool)
    if packed_message:
      packed_message.MergeFromString(message.value)
      colon = ':' if self.force_colon else ''
      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
      self._PrintMessageFieldValue(packed_message)
      self.out.write(' ' if self.as_one_line else '\n')
      return True
    else:
      return False

  def _TryCustomFormatMessage(self, message):
    # Returns True if the user-supplied message_formatter handled the
    # message; a None result means "fall through to default printing".
    formatted = self.message_formatter(message, self.indent, self.as_one_line)
    if formatted is None:
      return False

    out = self.out
    out.write(' ' * self.indent)
    out.write(formatted)
    out.write(' ' if self.as_one_line else '\n')
    return True

  def PrintMessage(self, message):
    """Convert protobuf message to text format.

    Args:
      message: The protocol buffers message.
    """
    # Custom formatter and expanded-Any handling take precedence over
    # field-by-field printing.
    if self.message_formatter and self._TryCustomFormatMessage(message):
      return
    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
        self._TryPrintAsAnyMessage(message)):
      return
    fields = message.ListFields()
    if self.use_index_order:
      fields.sort(
          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
    for field, value in fields:
      if _IsMapEntry(field):
        # Maps are printed as repeated entry messages, sorted by key for
        # deterministic output.
        for key in sorted(value):
          # This is slow for maps with submessage entries because it copies the
          # entire tree.  Unfortunately this would take significant refactoring
          # of this file to work around.
          #
          # TODO(haberman): refactor and optimize if this becomes an issue.
          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
          self.PrintField(field, entry_submsg)
      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
        # Short repeated format applies to scalar primitives only, never to
        # messages or strings.
        if (self.use_short_repeated_primitives
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
          self._PrintShortRepeatedPrimitivesValue(field, value)
        else:
          for element in value:
            self.PrintField(field, element)
      else:
        self.PrintField(field, value)

    if self.print_unknown_fields:
      self._PrintUnknownFields(message.UnknownFields())

  def _PrintUnknownFields(self, unknown_fields):
    """Print unknown fields."""
    out = self.out
    for field in unknown_fields:
      # Unknown fields are identified only by number, never by name.
      out.write(' ' * self.indent)
      out.write(str(field.field_number))
      if field.wire_type == WIRETYPE_START_GROUP:
        if self.as_one_line:
          out.write(' { ')
        else:
          out.write(' {\n')
          self.indent += 2

        self._PrintUnknownFields(field.data)

        if self.as_one_line:
          out.write('} ')
        else:
          self.indent -= 2
          out.write(' ' * self.indent + '}\n')
      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
        # Length-delimited data is ambiguous: it may be a nested message or
        # a string/bytes value. Heuristic: if the payload parses fully as an
        # unknown field set, print it as a nested message.
        try:
          # If this field is parseable as a Message, it is probably
          # an embedded message.
          # pylint: disable=protected-access
          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
              memoryview(field.data), 0, len(field.data))
        except Exception:    # pylint: disable=broad-except
          pos = 0

        if pos == len(field.data):
          if self.as_one_line:
            out.write(' { ')
          else:
            out.write(' {\n')
            self.indent += 2

          self._PrintUnknownFields(embedded_unknown_message)

          if self.as_one_line:
            out.write('} ')
          else:
            self.indent -= 2
            out.write(' ' * self.indent + '}\n')
        else:
          # A string or bytes field. self.as_utf8 may not work.
          out.write(': \"')
          out.write(text_encoding.CEscape(field.data, False))
          out.write('\" ' if self.as_one_line else '\"\n')
      else:
        # varint, fixed32, fixed64
        out.write(': ')
        out.write(str(field.data))
        out.write(' ' if self.as_one_line else '\n')

  def _PrintFieldName(self, field):
    """Print field name."""
    out = self.out
    out.write(' ' * self.indent)
    if self.use_field_number:
      out.write(str(field.number))
    else:
      if field.is_extension:
        out.write('[')
        # MessageSet extensions print the message type name instead of the
        # extension field name.
        if (field.containing_type.GetOptions().message_set_wire_format and
            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
          out.write(field.message_type.full_name)
        else:
          out.write(field.full_name)
        out.write(']')
      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
        # For groups, use the capitalized name.
        out.write(field.message_type.name)
      else:
        out.write(field.name)

    if (self.force_colon or
        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
      # The colon is optional in this case, but our cross-language golden files
      # don't include it. Here, the colon is only included if force_colon is
      # set to True
      out.write(':')

  def PrintField(self, field, value):
    """Print a single field name/value pair."""
    self._PrintFieldName(field)
    self.out.write(' ')
    self.PrintFieldValue(field, value)
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintShortRepeatedPrimitivesValue(self, field, value):
    """Prints short repeated primitives value as ``name: [v1, v2, ...]``."""
    # Note: this is called only when value has at least one element.
    self._PrintFieldName(field)
    self.out.write(' [')
    for i in six.moves.range(len(value) - 1):
      self.PrintFieldValue(field, value[i])
      self.out.write(', ')
    self.PrintFieldValue(field, value[-1])
    self.out.write(']')
    if self.force_colon:
      self.out.write(':')
    self.out.write(' ' if self.as_one_line else '\n')

  def _PrintMessageFieldValue(self, value):
    # Brackets depend on the pointy_brackets option: '<...>' vs '{...}'.
    if self.pointy_brackets:
      openb = '<'
      closeb = '>'
    else:
      openb = '{'
      closeb = '}'

    if self.as_one_line:
      self.out.write('%s ' % openb)
      self.PrintMessage(value)
      self.out.write(closeb)
    else:
      self.out.write('%s\n' % openb)
      self.indent += 2
      self.PrintMessage(value)
      self.indent -= 2
      self.out.write(' ' * self.indent + closeb)

  def PrintFieldValue(self, field, value):
    """Print a single field value (not including name).

    For repeated fields, the value should be a single element.

    Args:
      field: The descriptor of the field to be printed.
      value: The value of the field.
    """
    out = self.out
    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
      self._PrintMessageFieldValue(value)
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
      # Print the symbolic name when known; fall back to the raw number for
      # values missing from the enum descriptor.
      enum_value = field.enum_type.values_by_number.get(value, None)
      if enum_value is not None:
        out.write(enum_value.name)
      else:
        out.write(str(value))
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
      out.write('\"')
      if isinstance(value, six.text_type) and (six.PY2 or not self.as_utf8):
        out_value = value.encode('utf-8')
      else:
        out_value = value
      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
        # We always need to escape all binary data in TYPE_BYTES fields.
        out_as_utf8 = False
      else:
        out_as_utf8 = self.as_utf8
      out.write(text_encoding.CEscape(out_value, out_as_utf8))
      out.write('\"')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
      if value:
        out.write('true')
      else:
        out.write('false')
    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
      if self.float_format is not None:
        out.write('{1:{0}}'.format(self.float_format, value))
      else:
        # NaN has no shortest-float form; print it via str().
        if math.isnan(value):
          out.write(str(value))
        else:
          out.write(str(type_checkers.ToShortestFloat(value)))
    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
          self.double_format is not None):
      out.write('{1:{0}}'.format(self.double_format, value))
    else:
      out.write(str(value))
642
643
def Parse(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  NOTE: unlike the binary ``msg.ParseFromString(...)``, this function does
  not clear the target message first (for historical reasons). If text
  contains a field that is already set in message, the parsed value is
  appended when the field is repeated; otherwise an error is raised.

  Example::

    a = MyProto()
    a.repeated_field.append('test')
    b = MyProto()

    # Repeated fields are combined
    text_format.Parse(repr(a), b)
    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]

    # Non-repeated fields cannot be overwritten
    a.singular_field = 1
    b.singular_field = 2
    text_format.Parse(repr(a), b) # ParseError

    # Binary version:
    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"

  Caller is responsible for clearing the message as needed.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on the delimiter matching the input's type (bytes vs text).
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return ParseLines(text.split(newline),
                    message,
                    allow_unknown_extension=allow_unknown_extension,
                    allow_field_number=allow_field_number,
                    descriptor_pool=descriptor_pool,
                    allow_unknown_field=allow_unknown_field)
700
701
def Merge(text,
          message,
          allow_unknown_extension=False,
          allow_field_number=False,
          descriptor_pool=None,
          allow_unknown_field=False):
  """Parses a text representation of a protocol message into a message.

  Like Parse(), but a repeated occurrence of a non-repeated field in text is
  not an error: the last value wins. Any non-repeated, top-level field
  specified in text therefore replaces the corresponding value in message.

  Args:
    text (str): Message text representation.
    message (Message): A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    Message: The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  # Split on the delimiter matching the input's type (bytes vs text).
  newline = b'\n' if isinstance(text, bytes) else u'\n'
  return MergeLines(text.split(newline),
                    message,
                    allow_unknown_extension=allow_unknown_extension,
                    allow_field_number=allow_field_number,
                    descriptor_pool=descriptor_pool,
                    allow_unknown_field=allow_unknown_field)
738
739
def ParseLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Parses lines of a message's text representation into a message.

  See Parse() for caveats (the message is not cleared first).

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension=allow_unknown_extension,
                   allow_field_number=allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.ParseLines(lines, message)
772
773
def MergeLines(lines,
               message,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
  """Merges lines of a message's text representation into a message.

  See Merge() for more details (repeated scalar values take the last one).

  Args:
    lines: An iterable of lines of a message's text representation.
    message: A protocol buffer message to merge into.
    allow_unknown_extension: if True, skip over missing extensions and keep
      parsing
    allow_field_number: if True, both field number and field name are allowed.
    descriptor_pool: A DescriptorPool used to resolve Any types.
    allow_unknown_field: if True, skip over unknown field and keep
      parsing. Avoid to use this option if possible. It may hide some
      errors (e.g. spelling error on field name)

  Returns:
    The same message passed as argument.

  Raises:
    ParseError: On text parsing problems.
  """
  parser = _Parser(allow_unknown_extension=allow_unknown_extension,
                   allow_field_number=allow_field_number,
                   descriptor_pool=descriptor_pool,
                   allow_unknown_field=allow_unknown_field)
  return parser.MergeLines(lines, message)
806
807
class _Parser(object):
  """Text format parser for protocol message.

  The entry points are ParseLines() and MergeLines(); the only difference
  between them is whether a repeated occurrence of a singular field is an
  error (parse) or an overwrite (merge).
  """

  def __init__(self,
               allow_unknown_extension=False,
               allow_field_number=False,
               descriptor_pool=None,
               allow_unknown_field=False):
    """Initializes the parser.

    Args:
      allow_unknown_extension: if True, skip over missing extensions and
        keep parsing.
      allow_field_number: if True, both field number and field name are
        allowed when looking up a field.
      descriptor_pool: A DescriptorPool used to resolve Any types.
      allow_unknown_field: if True, skip over unknown fields and keep
        parsing.
    """
    self.allow_unknown_extension = allow_unknown_extension
    self.allow_field_number = allow_field_number
    self.descriptor_pool = descriptor_pool
    self.allow_unknown_field = allow_unknown_field

  def ParseLines(self, lines, message):
    """Parses a text representation of a protocol message into a message.

    Duplicate occurrences of a singular field raise a ParseError.
    """
    # _allow_multiple_scalars drives the duplicate-field checks in
    # _MergeMessageField and _MergeScalarField.
    self._allow_multiple_scalars = False
    self._ParseOrMerge(lines, message)
    return message

  def MergeLines(self, lines, message):
    """Merges a text representation of a protocol message into a message.

    Later occurrences of a singular field overwrite earlier ones.
    """
    self._allow_multiple_scalars = True
    self._ParseOrMerge(lines, message)
    return message

  def _ParseOrMerge(self, lines, message):
    """Converts a text representation of a protocol message into a message.

    Args:
      lines: Lines of a message's text representation.
      message: A protocol buffer message to merge into.

    Raises:
      ParseError: On text parsing problems.
    """
    # Tokenize expects native str lines: bytes are decoded on Python 3 and
    # text is encoded on Python 2, using UTF-8 either way.
    if six.PY2:
      str_lines = (line if isinstance(line, str) else line.encode('utf-8')
                   for line in lines)
    else:
      str_lines = (line if isinstance(line, str) else line.decode('utf-8')
                   for line in lines)
    tokenizer = Tokenizer(str_lines)
    while not tokenizer.AtEnd():
      self._MergeField(tokenizer, message)

  def _MergeField(self, tokenizer, message):
    """Merges a single protocol message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field name and values.
      message: A protocol message to record the data.

    Raises:
      ParseError: In case of text parsing problems.
    """
    message_descriptor = message.DESCRIPTOR
    # Expanded Any syntax: '[type_url] { ... }' — parse the nested message
    # of the named type and pack it into this Any.
    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
        tokenizer.TryConsume('[')):
      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
      tokenizer.Consume(']')
      tokenizer.TryConsume(':')
      if tokenizer.TryConsume('<'):
        expanded_any_end_token = '>'
      else:
        tokenizer.Consume('{')
        expanded_any_end_token = '}'
      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
                                                           self.descriptor_pool)
      if not expanded_any_sub_message:
        raise ParseError('Type %s not found in descriptor pool' %
                         packed_type_name)
      while not tokenizer.TryConsume(expanded_any_end_token):
        if tokenizer.AtEnd():
          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
                                                  (expanded_any_end_token,))
        self._MergeField(tokenizer, expanded_any_sub_message)
      # NOTE(review): the sub-message is packed non-deterministically;
      # presumably byte-level ordering is not significant for text input.
      deterministic = False

      message.Pack(expanded_any_sub_message,
                   type_url_prefix=type_url_prefix,
                   deterministic=deterministic)
      return

    # '[ext.full.name]' selects an extension field.
    if tokenizer.TryConsume('['):
      name = [tokenizer.ConsumeIdentifier()]
      while tokenizer.TryConsume('.'):
        name.append(tokenizer.ConsumeIdentifier())
      name = '.'.join(name)

      if not message_descriptor.is_extendable:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" does not have extensions.' %
            message_descriptor.full_name)
      # pylint: disable=protected-access
      field = message.Extensions._FindExtensionByName(name)
      # pylint: enable=protected-access
      if not field:
        if self.allow_unknown_extension:
          field = None
        else:
          raise tokenizer.ParseErrorPreviousToken(
              'Extension "%s" not registered. '
              'Did you import the _pb2 module which defines it? '
              'If you are trying to place the extension in the MessageSet '
              'field of another message that is in an Any or MessageSet field, '
              'that message\'s _pb2 module must be imported as well' % name)
      elif message_descriptor != field.containing_type:
        raise tokenizer.ParseErrorPreviousToken(
            'Extension "%s" does not extend message type "%s".' %
            (name, message_descriptor.full_name))

      tokenizer.Consume(']')

    else:
      # Regular field, addressed by name or (optionally) by number.
      name = tokenizer.ConsumeIdentifierOrNumber()
      if self.allow_field_number and name.isdigit():
        number = ParseInteger(name, True, True)
        field = message_descriptor.fields_by_number.get(number, None)
        if not field and message_descriptor.is_extendable:
          field = message.Extensions._FindExtensionByNumber(number)
      else:
        field = message_descriptor.fields_by_name.get(name, None)

        # Group names are expected to be capitalized as they appear in the
        # .proto file, which actually matches their type names, not their field
        # names.
        if not field:
          field = message_descriptor.fields_by_name.get(name.lower(), None)
          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
            field = None

        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
            field.message_type.name != name):
          field = None

      if not field and not self.allow_unknown_field:
        raise tokenizer.ParseErrorPreviousToken(
            'Message type "%s" has no field named "%s".' %
            (message_descriptor.full_name, name))

    # Merge the value(s) into the located field, or skip unknown contents.
    if field:
      if not self._allow_multiple_scalars and field.containing_oneof:
        # Check if there's a different field set in this oneof.
        # Note that we ignore the case if the same field was set before, and we
        # apply _allow_multiple_scalars to non-scalar fields as well.
        which_oneof = message.WhichOneof(field.containing_oneof.name)
        if which_oneof is not None and which_oneof != field.name:
          raise tokenizer.ParseErrorPreviousToken(
              'Field "%s" is specified along with field "%s", another member '
              'of oneof "%s" for message type "%s".' %
              (field.name, which_oneof, field.containing_oneof.name,
               message_descriptor.full_name))

      # The ':' separator is optional before a message body, required before
      # a scalar value.
      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        tokenizer.TryConsume(':')
        merger = self._MergeMessageField
      else:
        tokenizer.Consume(':')
        merger = self._MergeScalarField

      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
          tokenizer.TryConsume('[')):
        # Short repeated format, e.g. "foo: [1, 2, 3]"
        if not tokenizer.TryConsume(']'):
          while True:
            merger(tokenizer, message, field)
            if tokenizer.TryConsume(']'):
              break
            tokenizer.Consume(',')

      else:
        merger(tokenizer, message, field)

    else:  # Proto field is unknown.
      assert (self.allow_unknown_extension or self.allow_unknown_field)
      _SkipFieldContents(tokenizer)

    # For historical reasons, fields may optionally be separated by commas or
    # semicolons.
    if not tokenizer.TryConsume(','):
      tokenizer.TryConsume(';')

  def _ConsumeAnyTypeUrl(self, tokenizer):
    """Consumes a google.protobuf.Any type URL and returns the type name.

    Returns:
      A (type_url_prefix, packed_type_name) pair of dotted strings.
    """
    # Consume "type.googleapis.com/".
    prefix = [tokenizer.ConsumeIdentifier()]
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('.')
    prefix.append(tokenizer.ConsumeIdentifier())
    tokenizer.Consume('/')
    # Consume the fully-qualified type name.
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    return '.'.join(prefix), '.'.join(name)

  def _MergeMessageField(self, tokenizer, message, field):
    """Merges a single message field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: The message of which field is a member.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
    """
    is_map_entry = _IsMapEntry(field)

    # The body may be delimited by angle brackets or braces.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      elif is_map_entry:
        sub_message = getattr(message, field.name).GetEntryClass()()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        sub_message = message.Extensions[field]
      else:
        # Also apply _allow_multiple_scalars to message field.
        # TODO(jieluo): Change to _allow_singular_overwrites.
        if (not self._allow_multiple_scalars and
            message.HasField(field.name)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        sub_message = getattr(message, field.name)
      # Mark the singular field as present even if the body sets nothing.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
      self._MergeField(tokenizer, sub_message)

    if is_map_entry:
      # Map entries are parsed as synthetic key/value messages, then copied
      # into the map; message-typed values are merged rather than replaced.
      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
        value = getattr(message, field.name)[sub_message.key]
        value.MergeFrom(sub_message.value)
      else:
        getattr(message, field.name)[sub_message.key] = sub_message.value

  @staticmethod
  def _IsProto3Syntax(message):
    """Returns True if the message's descriptor declares proto3 syntax."""
    message_descriptor = message.DESCRIPTOR
    return (hasattr(message_descriptor, 'syntax') and
            message_descriptor.syntax == 'proto3')

  def _MergeScalarField(self, tokenizer, message, field):
    """Merges a single scalar field into a message.

    Args:
      tokenizer: A tokenizer to parse the field value.
      message: A protocol message to record the data.
      field: The descriptor of the field to be merged.

    Raises:
      ParseError: In case of text parsing problems.
      RuntimeError: On runtime errors.
    """
    # No-op reference; the attribute is not otherwise needed here.
    _ = self.allow_unknown_extension
    value = None

    # Dispatch on the wire type to the matching consume routine.
    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
                      descriptor.FieldDescriptor.TYPE_SINT32,
                      descriptor.FieldDescriptor.TYPE_SFIXED32):
      value = _ConsumeInt32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
                        descriptor.FieldDescriptor.TYPE_SINT64,
                        descriptor.FieldDescriptor.TYPE_SFIXED64):
      value = _ConsumeInt64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
                        descriptor.FieldDescriptor.TYPE_FIXED32):
      value = _ConsumeUint32(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
                        descriptor.FieldDescriptor.TYPE_FIXED64):
      value = _ConsumeUint64(tokenizer)
    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
                        descriptor.FieldDescriptor.TYPE_DOUBLE):
      value = tokenizer.ConsumeFloat()
    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
      value = tokenizer.ConsumeBool()
    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
      value = tokenizer.ConsumeString()
    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
      value = tokenizer.ConsumeByteString()
    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
      value = tokenizer.ConsumeEnum(field)
    else:
      raise RuntimeError('Unknown field type %d' % field.type)

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        message.Extensions[field].append(value)
      else:
        getattr(message, field.name).append(value)
    else:
      if field.is_extension:
        if (not self._allow_multiple_scalars and
            not self._IsProto3Syntax(message) and
            message.HasExtension(field)):
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" extensions.' %
              (message.DESCRIPTOR.full_name, field.full_name))
        else:
          message.Extensions[field] = value
      else:
        duplicate_error = False
        if not self._allow_multiple_scalars:
          if self._IsProto3Syntax(message):
            # Proto3 doesn't represent presence so we try best effort to check
            # multiple scalars by compare to default values.
            duplicate_error = bool(getattr(message, field.name))
          else:
            duplicate_error = message.HasField(field.name)

        if duplicate_error:
          raise tokenizer.ParseErrorPreviousToken(
              'Message type "%s" should not have multiple "%s" fields.' %
              (message.DESCRIPTOR.full_name, field.name))
        else:
          setattr(message, field.name, value)
1144
1145
def _SkipFieldContents(tokenizer):
  """Skips over contents (value or message) of a field.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  # Guess the field's kind from the punctuation: a scalar value follows a
  # ':' and never opens with '{' or '<'.  Everything else — no ':' at all,
  # or ':' followed by '{'/'<' — must be a message body (or ill-formed
  # input, which the message-skipping path will report).
  looks_like_scalar = (tokenizer.TryConsume(':') and
                       not tokenizer.LookingAt('{') and
                       not tokenizer.LookingAt('<'))
  if looks_like_scalar:
    _SkipFieldValue(tokenizer)
  else:
    _SkipFieldMessage(tokenizer)
1163
1164
def _SkipField(tokenizer):
  """Skips over a complete field (name and value/message).

  Args:
    tokenizer: A tokenizer to parse the field name and values.
  """
  if not tokenizer.TryConsume('['):
    # Regular field: a bare identifier or field number.
    tokenizer.ConsumeIdentifierOrNumber()
  else:
    # Extension name: a dotted identifier wrapped in brackets.
    tokenizer.ConsumeIdentifier()
    while tokenizer.TryConsume('.'):
      tokenizer.ConsumeIdentifier()
    tokenizer.Consume(']')

  _SkipFieldContents(tokenizer)

  # For historical reasons, fields may optionally be separated by commas or
  # semicolons.
  if not tokenizer.TryConsume(','):
    tokenizer.TryConsume(';')
1186
1187
1188def _SkipFieldMessage(tokenizer):
1189  """Skips over a field message.
1190
1191  Args:
1192    tokenizer: A tokenizer to parse the field name and values.
1193  """
1194
1195  if tokenizer.TryConsume('<'):
1196    delimiter = '>'
1197  else:
1198    tokenizer.Consume('{')
1199    delimiter = '}'
1200
1201  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1202    _SkipField(tokenizer)
1203
1204  tokenizer.Consume(delimiter)
1205
1206
1207def _SkipFieldValue(tokenizer):
1208  """Skips over a field value.
1209
1210  Args:
1211    tokenizer: A tokenizer to parse the field name and values.
1212
1213  Raises:
1214    ParseError: In case an invalid field value is found.
1215  """
1216  # String/bytes tokens can come in multiple adjacent string literals.
1217  # If we can consume one, consume as many as we can.
1218  if tokenizer.TryConsumeByteString():
1219    while tokenizer.TryConsumeByteString():
1220      pass
1221    return
1222
1223  if (not tokenizer.TryConsumeIdentifier() and
1224      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
1225      not tokenizer.TryConsumeFloat()):
1226    raise ParseError('Invalid field value: ' + tokenizer.token)
1227
1228
class Tokenizer(object):
  """Protocol buffer text representation tokenizer.

  This class handles the lower level string parsing by splitting it into
  meaningful tokens.

  It was directly ported from the Java protocol buffer API.
  """

  _WHITESPACE = re.compile(r'\s+')
  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
  _TOKEN = re.compile('|'.join([
      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
  ] + [  # quoted str for each quote mark
      # Avoid backtracking! https://stackoverflow.com/a/844267
      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
      for mark in _QUOTES
  ]))

  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')

  def __init__(self, lines, skip_comments=True):
    self._position = 0
    self._line = -1  # 0-based line index; -1 until _PopLine fetches a line.
    self._column = 0
    self._token_start = None
    # Current lookahead token; the empty string means end of input (AtEnd).
    self.token = ''
    self._lines = iter(lines)
    self._current_line = ''
    # Position of the previously consumed token, used by
    # ParseErrorPreviousToken for error reporting.
    self._previous_line = 0
    self._previous_column = 0
    self._more_lines = True
    self._skip_comments = skip_comments
    # 'and/or' conditional idiom: comments count as whitespace unless
    # skip_comments is False.
    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
                                or self._WHITESPACE)
    self._SkipWhitespace()
    self.NextToken()

  def LookingAt(self, token):
    """Returns True iff the current token equals `token` (no consumption)."""
    return self.token == token

  def AtEnd(self):
    """Checks the end of the text was reached.

    Returns:
      True iff the end was reached.
    """
    return not self.token

  def _PopLine(self):
    """Advances to the next input line once the current one is exhausted."""
    while len(self._current_line) <= self._column:
      try:
        self._current_line = next(self._lines)
      except StopIteration:
        # Input exhausted: leave an empty line and flag end-of-input.
        self._current_line = ''
        self._more_lines = False
        return
      else:
        self._line += 1
        self._column = 0

  def _SkipWhitespace(self):
    """Skips whitespace (and comments, when they are skippable)."""
    while True:
      self._PopLine()
      match = self._whitespace_pattern.match(self._current_line, self._column)
      if not match:
        break
      length = len(match.group(0))
      self._column += length

  def TryConsume(self, token):
    """Tries to consume a given piece of text.

    Args:
      token: Text to consume.

    Returns:
      True iff the text was consumed.
    """
    if self.token == token:
      self.NextToken()
      return True
    return False

  def Consume(self, token):
    """Consumes a piece of text.

    Args:
      token: Text to consume.

    Raises:
      ParseError: If the text couldn't be consumed.
    """
    if not self.TryConsume(token):
      raise self.ParseError('Expected "%s".' % token)

  def ConsumeComment(self):
    """Consumes a comment token and returns its text.

    Raises:
      ParseError: If the current token is not a comment.
    """
    result = self.token
    if not self._COMMENT.match(result):
      raise self.ParseError('Expected comment.')
    self.NextToken()
    return result

  def ConsumeCommentOrTrailingComment(self):
    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""

    # Tokenizer initializes _previous_line and _previous_column to 0. As the
    # tokenizer starts, it looks like there is a previous token on the line.
    just_started = self._line == 0 and self._column == 0

    before_parsing = self._previous_line
    comment = self.ConsumeComment()

    # A trailing comment is a comment on the same line as the previous token.
    trailing = (self._previous_line == before_parsing
                and not just_started)

    return trailing, comment

  def TryConsumeIdentifier(self):
    """Returns True if an identifier was consumed, False otherwise."""
    try:
      self.ConsumeIdentifier()
      return True
    except ParseError:
      return False

  def ConsumeIdentifier(self):
    """Consumes protocol message field identifier.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER.match(result):
      raise self.ParseError('Expected identifier.')
    self.NextToken()
    return result

  def TryConsumeIdentifierOrNumber(self):
    """Returns True if an identifier or number was consumed."""
    try:
      self.ConsumeIdentifierOrNumber()
      return True
    except ParseError:
      return False

  def ConsumeIdentifierOrNumber(self):
    """Consumes protocol message field identifier or number.

    Returns:
      Identifier string.

    Raises:
      ParseError: If an identifier couldn't be consumed.
    """
    result = self.token
    if not self._IDENTIFIER_OR_NUMBER.match(result):
      raise self.ParseError('Expected identifier or number, got %s.' % result)
    self.NextToken()
    return result

  def TryConsumeInteger(self):
    """Returns True if an integer was consumed, False otherwise."""
    try:
      # Note: is_long only affects value type, not whether an error is raised.
      self.ConsumeInteger()
      return True
    except ParseError:
      return False

  def ConsumeInteger(self, is_long=False):
    """Consumes an integer number.

    Args:
      is_long: True if the value should be returned as a long integer.
    Returns:
      The integer parsed.

    Raises:
      ParseError: If an integer couldn't be consumed.
    """
    try:
      result = _ParseAbstractInteger(self.token, is_long=is_long)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeFloat(self):
    """Returns True if a float was consumed, False otherwise."""
    try:
      self.ConsumeFloat()
      return True
    except ParseError:
      return False

  def ConsumeFloat(self):
    """Consumes a floating point number.

    Returns:
      The number parsed.

    Raises:
      ParseError: If a floating point number couldn't be consumed.
    """
    try:
      result = ParseFloat(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeBool(self):
    """Consumes a boolean value.

    Returns:
      The bool parsed.

    Raises:
      ParseError: If a boolean value couldn't be consumed.
    """
    try:
      result = ParseBool(self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def TryConsumeByteString(self):
    """Returns True if a byte string was consumed, False otherwise."""
    try:
      self.ConsumeByteString()
      return True
    except ParseError:
      return False

  def ConsumeString(self):
    """Consumes a string value.

    Returns:
      The string parsed.

    Raises:
      ParseError: If a string value couldn't be consumed.
    """
    the_bytes = self.ConsumeByteString()
    try:
      return six.text_type(the_bytes, 'utf-8')
    except UnicodeDecodeError as e:
      raise self._StringParseError(e)

  def ConsumeByteString(self):
    """Consumes a byte array value.

    Returns:
      The array parsed (as a string).

    Raises:
      ParseError: If a byte array value couldn't be consumed.
    """
    # Adjacent string literals are concatenated, as in C or Python.
    the_list = [self._ConsumeSingleByteString()]
    while self.token and self.token[0] in _QUOTES:
      the_list.append(self._ConsumeSingleByteString())
    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When the wrong format data is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self.ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self.ParseError('String missing ending quote: %r' % (text,))

    try:
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ConsumeEnum(self, field):
    """Consumes the current token as a value of the given enum field.

    Raises:
      ParseError: If the token is not a valid value for the enum.
    """
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError(message, self._previous_line + 1,
                      self._previous_column + 1)

  def ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('\'' + self._current_line + '\': ' + message,
                      self._line + 1, self._column + 1)

  def _StringParseError(self, e):
    return self.ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    # Remember where the token being left behind started, for
    # ParseErrorPreviousToken().
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

    match = self._TOKEN.match(self._current_line, self._column)
    if not match and not self._skip_comments:
      # Comments are significant in this mode; surface them as tokens.
      match = self._COMMENT.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      # Fall back to a single-character token (punctuation such as ':').
      self.token = self._current_line[self._column]
1570
# Aliased so the class can still be reached by existing callers that access
# this module's private names directly.
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name
1574
1575
def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  # Signed, 32-bit: the range check happens inside _ConsumeInteger.
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=True)
1589
1590
def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  # Unsigned, 32-bit: the range check happens inside _ConsumeInteger.
  return _ConsumeInteger(tokenizer, is_long=False, is_signed=False)
1604
1605
def _TryConsumeInt64(tokenizer):
  """Returns True if a signed 64bit integer was consumed, False otherwise."""
  try:
    _ConsumeInt64(tokenizer)
  except ParseError:
    return False
  return True
1612
1613
def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  # Fix: the docstring previously said "32bit", but is_long=True selects the
  # 64-bit range check (cf. _ConsumeUint64, which documents 64bit).
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1627
1628
def _TryConsumeUint64(tokenizer):
  """Returns True if an unsigned 64bit integer was consumed, False otherwise."""
  try:
    _ConsumeUint64(tokenizer)
  except ParseError:
    return False
  return True
1635
1636
def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  # Unsigned, 64-bit: the range check happens inside _ConsumeInteger.
  return _ConsumeInteger(tokenizer, is_long=True, is_signed=False)
1650
1651
def _TryConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Returns True if an integer with the given traits was consumed."""
  try:
    _ConsumeInteger(tokenizer, is_signed=is_signed, is_long=is_long)
  except ParseError:
    return False
  return True
1658
1659
def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    value = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    # Surface the failure as a ParseError anchored at the current token.
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return value
1680
1681
def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: Thrown Iff the text is not a valid integer.
  """
  # Parse first; any ValueError propagates to the caller.
  value = _ParseAbstractInteger(text, is_long=is_long)

  # Then range-check with the matching checker; the checkers are indexed
  # as 2 * is_long + is_signed.  Exceptions are handled by callers.
  _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)].CheckValue(value)
  return value
1703
1704
1705def _ParseAbstractInteger(text, is_long=False):
1706  """Parses an integer without checking size/signedness.
1707
1708  Args:
1709    text: The text to parse.
1710    is_long: True if the value should be returned as a long integer.
1711
1712  Returns:
1713    The integer value.
1714
1715  Raises:
1716    ValueError: Thrown Iff the text is not a valid integer.
1717  """
1718  # Do the actual parsing. Exception handling is propagated to caller.
1719  orig_text = text
1720  c_octal_match = re.match(r'(-?)0(\d+)$', text)
1721  if c_octal_match:
1722    # Python 3 no longer supports 0755 octal syntax without the 'o', so
1723    # we always use the '0o' prefix for multi-digit numbers starting with 0.
1724    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1725  try:
1726    # We force 32-bit values to int and 64-bit values to long to make
1727    # alternate implementations where the distinction is more significant
1728    # (e.g. the C++ implementation) simpler.
1729    if is_long:
1730      return long(text, 0)
1731    else:
1732      return int(text, 0)
1733  except ValueError:
1734    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1735
1736
def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  # Fast path: Python's float() already accepts most spellings.
  try:
    return float(text)
  except ValueError:
    pass

  # Proto text format allows spellings that float() rejects.
  if _FLOAT_INFINITY.match(text):
    return float('-inf') if text[0] == '-' else float('inf')
  if _FLOAT_NAN.match(text):
    return float('nan')

  # Last resort: assume a C-style literal with an 'f' suffix, e.g. '1.0f'.
  try:
    return float(text.rstrip('f'))
  except ValueError:
    raise ValueError('Couldn\'t parse float: %s' % text)
1767
1768
def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    Boolean values parsed

  Raises:
    ValueError: If text is not a valid boolean.
  """
  # The accepted spellings mirror the C++ text-format parser.
  if text in {'true', 't', '1', 'True'}:
    return True
  if text in {'false', 'f', '0', 'False'}:
    return False
  raise ValueError('Expected "true" or "false".')
1787
1788
def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Not numeric, so treat the value as an identifier (the enum name).
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
    return enum_value.number

  # Numeric value. Proto3 accepts numeric unknown enums, so skip the
  # known-value lookup entirely in that case. (The hasattr check keeps
  # compatibility with descriptors that predate the 'syntax' attribute.)
  if hasattr(field.file, 'syntax') and field.file.syntax == 'proto3':
    return number
  enum_value = enum_descriptor.values_by_number.get(number, None)
  if enum_value is None:
    raise ValueError('Enum type "%s" has no value with number %d.' %
                     (enum_descriptor.full_name, number))
  return enum_value.number
1826