1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3# https://developers.google.com/protocol-buffers/
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
8#
9#     * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11#     * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15#     * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
18#
19# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31"""Contains routines for printing protocol messages in text format.
32
33Simple usage example::
34
35  # Create a proto object and serialize it to a text proto string.
36  message = my_proto_pb2.MyMessage(foo='bar')
37  text_proto = text_format.MessageToString(message)
38
39  # Parse a text proto string.
40  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
41"""
42
43__author__ = 'kenton@google.com (Kenton Varda)'
44
45# TODO(b/129989314) Import thread contention leads to test failures.
46import encodings.raw_unicode_escape  # pylint: disable=unused-import
47import encodings.unicode_escape  # pylint: disable=unused-import
48import io
49import math
50import re
51
52from google.protobuf.internal import decoder
53from google.protobuf.internal import type_checkers
54from google.protobuf import descriptor
55from google.protobuf import text_encoding
56
57# pylint: disable=g-import-not-at-top
58__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
59           'PrintFieldValue', 'Merge', 'MessageToBytes']
60
61_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
62                     type_checkers.Int32ValueChecker(),
63                     type_checkers.Uint64ValueChecker(),
64                     type_checkers.Int64ValueChecker())
65_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
66_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
67_QUOTES = frozenset(("'", '"'))
68_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
69
70
71class Error(Exception):
72  """Top-level module error for text_format."""
73
74
75class ParseError(Error):
76  """Thrown in case of text parsing or tokenizing error."""
77
78  def __init__(self, message=None, line=None, column=None):
79    if message is not None and line is not None:
80      loc = str(line)
81      if column is not None:
82        loc += ':{0}'.format(column)
83      message = '{0} : {1}'.format(loc, message)
84    if message is not None:
85      super(ParseError, self).__init__(message)
86    else:
87      super(ParseError, self).__init__()
88    self._line = line
89    self._column = column
90
91  def GetLine(self):
92    return self._line
93
94  def GetColumn(self):
95    return self._column
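
  # Illustrative example (not part of the original docstring):
  # ParseError('Expected "}".', line=3, column=7) carries the message
  # '3:7 : Expected "}".', with GetLine() == 3 and GetColumn() == 7.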
96
97
98class TextWriter(object):
99
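  # Note: the as_utf8 argument is accepted but not used here; output is always
  # buffered in a text-mode io.StringIO, and any byte encoding happens later
  # (see MessageToBytes).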
100  def __init__(self, as_utf8):
101    self._writer = io.StringIO()
102
103  def write(self, val):
104    return self._writer.write(val)
105
106  def close(self):
107    return self._writer.close()
108
109  def getvalue(self):
110    return self._writer.getvalue()
111
112
113def MessageToString(
114    message,
115    as_utf8=False,
116    as_one_line=False,
117    use_short_repeated_primitives=False,
118    pointy_brackets=False,
119    use_index_order=False,
120    float_format=None,
121    double_format=None,
122    use_field_number=False,
123    descriptor_pool=None,
124    indent=0,
125    message_formatter=None,
126    print_unknown_fields=False,
127    force_colon=False):
128  # type: (...) -> str
129  """Convert protobuf message to text format.
130
131  Double values can be formatted compactly with 15 digits of
132  precision (which is the most that IEEE 754 "double" can guarantee)
133  using double_format='.15g'. To ensure that converting to text and back to a
134  proto will result in an identical value, double_format='.17g' should be used.
135
136  Args:
137    message: The protocol buffers message.
138    as_utf8: Return unescaped Unicode for non-ASCII characters.
139        In Python 3 actual Unicode characters may appear as is in strings.
140        In Python 2 the return value will be valid UTF-8 rather than only ASCII.
141    as_one_line: Don't introduce newlines between fields.
142    use_short_repeated_primitives: Use short repeated format for primitives.
143    pointy_brackets: If True, use angle brackets instead of curly braces for
144      nesting.
145    use_index_order: If True, fields of a proto message will be printed using
146      the order defined in source code instead of the field number, extensions
147      will be printed at the end of the message and their relative order is
148      determined by the extension number. By default, use the field number
149      order.
150    float_format (str): If set, use this to specify float field formatting
151      (per the "Format Specification Mini-Language"); otherwise, the shortest
152      float that has the same value on the wire will be printed. This also
153      affects double fields if double_format is not set but float_format is.
154    double_format (str): If set, use this to specify double field formatting
155      (per the "Format Specification Mini-Language"); if it is not set but
156      float_format is set, use float_format. Otherwise, use ``str()``.
157    use_field_number: If True, print field numbers instead of names.
158    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
159    indent (int): The initial indent level, in terms of spaces, for pretty
160      print.
161    message_formatter (function(message, indent, as_one_line) -> unicode|None):
162      Custom formatter for selected sub-messages (usually based on message
163      type). Use to pretty print parts of the protobuf for easier diffing.
164    print_unknown_fields: If True, unknown fields will be printed.
165    force_colon: If set, a colon will be added after the field name even if the
166      field is a proto message.
167
168  Returns:
169    str: A string of the text formatted protocol buffer message.
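
  Example (illustrative, reusing the my_proto_pb2.MyMessage message from the
  module docstring, which has a string field ``foo``)::

    message = my_proto_pb2.MyMessage(foo='bar')
    text_format.MessageToString(message)
    # -> 'foo: "bar"' followed by a newline
    text_format.MessageToString(message, as_one_line=True)
    # -> 'foo: "bar"' with no trailing newline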
170  """
171  out = TextWriter(as_utf8)
172  printer = _Printer(
173      out,
174      indent,
175      as_utf8,
176      as_one_line,
177      use_short_repeated_primitives,
178      pointy_brackets,
179      use_index_order,
180      float_format,
181      double_format,
182      use_field_number,
183      descriptor_pool,
184      message_formatter,
185      print_unknown_fields=print_unknown_fields,
186      force_colon=force_colon)
187  printer.PrintMessage(message)
188  result = out.getvalue()
189  out.close()
190  if as_one_line:
191    return result.rstrip()
192  return result
193
194
195def MessageToBytes(message, **kwargs):
196  # type: (...) -> bytes
197  """Convert protobuf message to encoded text format.  See MessageToString."""
198  text = MessageToString(message, **kwargs)
199  if isinstance(text, bytes):
200    return text
201  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
202  return text.encode(codec)
203
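# Illustrative relationship between MessageToString and MessageToBytes above
# (assuming `message` is any protobuf message instance):
#
#   data = MessageToBytes(message, as_utf8=True)   # bytes, UTF-8 encoded
#   text = MessageToString(message, as_utf8=True)  # str
#   assert data == text.encode('utf-8')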
204
205def _IsMapEntry(field):
206  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
207          field.message_type.has_options and
208          field.message_type.GetOptions().map_entry)
209
210
211def PrintMessage(message,
212                 out,
213                 indent=0,
214                 as_utf8=False,
215                 as_one_line=False,
216                 use_short_repeated_primitives=False,
217                 pointy_brackets=False,
218                 use_index_order=False,
219                 float_format=None,
220                 double_format=None,
221                 use_field_number=False,
222                 descriptor_pool=None,
223                 message_formatter=None,
224                 print_unknown_fields=False,
225                 force_colon=False):
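  """Convert the message to text format and write it to the out stream.

  See MessageToString() for the meaning of the formatting keyword arguments;
  out is the stream the text is written to.
  """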
226  printer = _Printer(
227      out=out, indent=indent, as_utf8=as_utf8,
228      as_one_line=as_one_line,
229      use_short_repeated_primitives=use_short_repeated_primitives,
230      pointy_brackets=pointy_brackets,
231      use_index_order=use_index_order,
232      float_format=float_format,
233      double_format=double_format,
234      use_field_number=use_field_number,
235      descriptor_pool=descriptor_pool,
236      message_formatter=message_formatter,
237      print_unknown_fields=print_unknown_fields,
238      force_colon=force_colon)
239  printer.PrintMessage(message)
240
241
242def PrintField(field,
243               value,
244               out,
245               indent=0,
246               as_utf8=False,
247               as_one_line=False,
248               use_short_repeated_primitives=False,
249               pointy_brackets=False,
250               use_index_order=False,
251               float_format=None,
252               double_format=None,
253               message_formatter=None,
254               print_unknown_fields=False,
255               force_colon=False):
256  """Print a single field name/value pair."""
257  printer = _Printer(out, indent, as_utf8, as_one_line,
258                     use_short_repeated_primitives, pointy_brackets,
259                     use_index_order, float_format, double_format,
260                     message_formatter=message_formatter,
261                     print_unknown_fields=print_unknown_fields,
262                     force_colon=force_colon)
263  printer.PrintField(field, value)
264
265
266def PrintFieldValue(field,
267                    value,
268                    out,
269                    indent=0,
270                    as_utf8=False,
271                    as_one_line=False,
272                    use_short_repeated_primitives=False,
273                    pointy_brackets=False,
274                    use_index_order=False,
275                    float_format=None,
276                    double_format=None,
277                    message_formatter=None,
278                    print_unknown_fields=False,
279                    force_colon=False):
280  """Print a single field value (not including name)."""
281  printer = _Printer(out, indent, as_utf8, as_one_line,
282                     use_short_repeated_primitives, pointy_brackets,
283                     use_index_order, float_format, double_format,
284                     message_formatter=message_formatter,
285                     print_unknown_fields=print_unknown_fields,
286                     force_colon=force_colon)
287  printer.PrintFieldValue(field, value)
288
289
290def _BuildMessageFromTypeName(type_name, descriptor_pool):
291  """Returns a protobuf message instance.
292
293  Args:
294    type_name: Fully-qualified protobuf message type name string.
295    descriptor_pool: DescriptorPool instance.
296
297  Returns:
298    A Message instance of type matching type_name, or None if no Descriptor
299    matching type_name was found.
300  """
301  # pylint: disable=g-import-not-at-top
302  if descriptor_pool is None:
303    from google.protobuf import descriptor_pool as pool_mod
304    descriptor_pool = pool_mod.Default()
305  from google.protobuf import symbol_database
306  database = symbol_database.Default()
307  try:
308    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
309  except KeyError:
310    return None
311  message_type = database.GetPrototype(message_descriptor)
312  return message_type()
313
314
315# These values must match WireType enum in google/protobuf/wire_format.h.
316WIRETYPE_LENGTH_DELIMITED = 2
317WIRETYPE_START_GROUP = 3
318
319
320class _Printer(object):
321  """Text format printer for protocol message."""
322
323  def __init__(
324      self,
325      out,
326      indent=0,
327      as_utf8=False,
328      as_one_line=False,
329      use_short_repeated_primitives=False,
330      pointy_brackets=False,
331      use_index_order=False,
332      float_format=None,
333      double_format=None,
334      use_field_number=False,
335      descriptor_pool=None,
336      message_formatter=None,
337      print_unknown_fields=False,
338      force_colon=False):
339    """Initialize the Printer.
340
341    Double values can be formatted compactly with 15 digits of precision
342    (which is the most that IEEE 754 "double" can guarantee) using
343    double_format='.15g'. To ensure that converting to text and back to a proto
344    will result in an identical value, double_format='.17g' should be used.
345
346    Args:
347      out: To record the text format result.
348      indent: The initial indent level for pretty print.
349      as_utf8: Return unescaped Unicode for non-ASCII characters.
350          In Python 3 actual Unicode characters may appear as is in strings.
351          In Python 2 the return value will be valid UTF-8 rather than ASCII.
352      as_one_line: Don't introduce newlines between fields.
353      use_short_repeated_primitives: Use short repeated format for primitives.
354      pointy_brackets: If True, use angle brackets instead of curly braces for
355        nesting.
356      use_index_order: If True, print fields of a proto message using the order
357        defined in source code instead of the field number. By default, use the
358        field number order.
359      float_format: If set, use this to specify float field formatting
360        (per the "Format Specification Mini-Language"); otherwise, the
361        shortest float that has the same value on the wire will be printed.
362        This also affects double fields if double_format is not set.
363      double_format: If set, use this to specify double field formatting
364        (per the "Format Specification Mini-Language"); if it is not set but
365        float_format is set, use float_format. Otherwise, str() is used.
366      use_field_number: If True, print field numbers instead of names.
367      descriptor_pool: A DescriptorPool used to resolve Any types.
368      message_formatter: A function(message, indent, as_one_line): unicode|None
369        to custom format selected sub-messages (usually based on message type).
370        Use to pretty print parts of the protobuf for easier diffing.
371      print_unknown_fields: If True, unknown fields will be printed.
372      force_colon: If set, a colon will be added after the field name even if
373        the field is a proto message.
374    """
375    self.out = out
376    self.indent = indent
377    self.as_utf8 = as_utf8
378    self.as_one_line = as_one_line
379    self.use_short_repeated_primitives = use_short_repeated_primitives
380    self.pointy_brackets = pointy_brackets
381    self.use_index_order = use_index_order
382    self.float_format = float_format
383    if double_format is not None:
384      self.double_format = double_format
385    else:
386      self.double_format = float_format
387    self.use_field_number = use_field_number
388    self.descriptor_pool = descriptor_pool
389    self.message_formatter = message_formatter
390    self.print_unknown_fields = print_unknown_fields
391    self.force_colon = force_colon
392
393  def _TryPrintAsAnyMessage(self, message):
394    """Serializes if message is a google.protobuf.Any field."""
395    if '/' not in message.type_url:
396      return False
397    packed_message = _BuildMessageFromTypeName(message.TypeName(),
398                                               self.descriptor_pool)
399    if packed_message:
400      packed_message.MergeFromString(message.value)
401      colon = ':' if self.force_colon else ''
402      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
403      self._PrintMessageFieldValue(packed_message)
404      self.out.write(' ' if self.as_one_line else '\n')
405      return True
406    else:
407      return False
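  # Illustrative output of the Any expansion above (assuming a packed message
  # of some type `pkg.Foo` whose `text` field is set to "hi"):
  #
  #   [type.googleapis.com/pkg.Foo] {
  #     text: "hi"
  #   }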
408
409  def _TryCustomFormatMessage(self, message):
410    formatted = self.message_formatter(message, self.indent, self.as_one_line)
411    if formatted is None:
412      return False
413
414    out = self.out
415    out.write(' ' * self.indent)
416    out.write(formatted)
417    out.write(' ' if self.as_one_line else '\n')
418    return True
419
420  def PrintMessage(self, message):
421    """Convert protobuf message to text format.
422
423    Args:
424      message: The protocol buffers message.
425    """
426    if self.message_formatter and self._TryCustomFormatMessage(message):
427      return
428    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
429        self._TryPrintAsAnyMessage(message)):
430      return
431    fields = message.ListFields()
432    if self.use_index_order:
433      fields.sort(
434          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
435    for field, value in fields:
436      if _IsMapEntry(field):
437        for key in sorted(value):
438          # This is slow for maps with submessage entries because it copies the
439          # entire tree.  Unfortunately this would take significant refactoring
440          # of this file to work around.
441          #
442          # TODO(haberman): refactor and optimize if this becomes an issue.
443          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
444          self.PrintField(field, entry_submsg)
445      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
446        if (self.use_short_repeated_primitives
447            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
448            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
449          self._PrintShortRepeatedPrimitivesValue(field, value)
450        else:
451          for element in value:
452            self.PrintField(field, element)
453      else:
454        self.PrintField(field, value)
455
456    if self.print_unknown_fields:
457      self._PrintUnknownFields(message.UnknownFields())
458
459  def _PrintUnknownFields(self, unknown_fields):
460    """Print unknown fields."""
461    out = self.out
462    for field in unknown_fields:
463      out.write(' ' * self.indent)
464      out.write(str(field.field_number))
465      if field.wire_type == WIRETYPE_START_GROUP:
466        if self.as_one_line:
467          out.write(' { ')
468        else:
469          out.write(' {\n')
470          self.indent += 2
471
472        self._PrintUnknownFields(field.data)
473
474        if self.as_one_line:
475          out.write('} ')
476        else:
477          self.indent -= 2
478          out.write(' ' * self.indent + '}\n')
479      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
480        try:
481          # If this field is parseable as a Message, it is probably
482          # an embedded message.
483          # pylint: disable=protected-access
484          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
485              memoryview(field.data), 0, len(field.data))
486        except Exception:    # pylint: disable=broad-except
487          pos = 0
488
489        if pos == len(field.data):
490          if self.as_one_line:
491            out.write(' { ')
492          else:
493            out.write(' {\n')
494            self.indent += 2
495
496          self._PrintUnknownFields(embedded_unknown_message)
497
498          if self.as_one_line:
499            out.write('} ')
500          else:
501            self.indent -= 2
502            out.write(' ' * self.indent + '}\n')
503        else:
504          # A string or bytes field; the data may not be valid UTF-8, so it is always escaped.
505          out.write(': \"')
506          out.write(text_encoding.CEscape(field.data, False))
507          out.write('\" ' if self.as_one_line else '\"\n')
508      else:
509        # varint, fixed32, fixed64
510        out.write(': ')
511        out.write(str(field.data))
512        out.write(' ' if self.as_one_line else '\n')
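  # Illustrative _PrintUnknownFields output: an unknown varint field with
  # number 5 and value 7 prints as `5: 7`; an unknown length-delimited field
  # whose payload parses as a message prints as `5 { ... }`, and one that
  # does not prints as an escaped string, e.g. `5: "raw bytes"`.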
513
514  def _PrintFieldName(self, field):
515    """Print field name."""
516    out = self.out
517    out.write(' ' * self.indent)
518    if self.use_field_number:
519      out.write(str(field.number))
520    else:
521      if field.is_extension:
522        out.write('[')
523        if (field.containing_type.GetOptions().message_set_wire_format and
524            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
525            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
526          out.write(field.message_type.full_name)
527        else:
528          out.write(field.full_name)
529        out.write(']')
530      elif field.type == descriptor.FieldDescriptor.TYPE_GROUP:
531        # For groups, use the capitalized name.
532        out.write(field.message_type.name)
533      else:
534        out.write(field.name)
535
536    if (self.force_colon or
537        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
538      # The colon is optional in this case, but our cross-language golden files
539      # don't include it. Here, the colon is only included if force_colon is
540      # set to True.
541      out.write(':')
542
543  def PrintField(self, field, value):
544    """Print a single field name/value pair."""
545    self._PrintFieldName(field)
546    self.out.write(' ')
547    self.PrintFieldValue(field, value)
548    self.out.write(' ' if self.as_one_line else '\n')
549
550  def _PrintShortRepeatedPrimitivesValue(self, field, value):
551    """"Prints short repeated primitives value."""
552    # Note: this is called only when value has at least one element.
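    # For example (with a hypothetical repeated int32 field `ids` holding
    # [1, 2, 3]), this prints `ids: [1, 2, 3]` instead of three separate
    # `ids: 1` / `ids: 2` / `ids: 3` lines.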
553    self._PrintFieldName(field)
554    self.out.write(' [')
555    for i in range(len(value) - 1):
556      self.PrintFieldValue(field, value[i])
557      self.out.write(', ')
558    self.PrintFieldValue(field, value[-1])
559    self.out.write(']')
560    self.out.write(' ' if self.as_one_line else '\n')
561
562  def _PrintMessageFieldValue(self, value):
563    if self.pointy_brackets:
564      openb = '<'
565      closeb = '>'
566    else:
567      openb = '{'
568      closeb = '}'
569
570    if self.as_one_line:
571      self.out.write('%s ' % openb)
572      self.PrintMessage(value)
573      self.out.write(closeb)
574    else:
575      self.out.write('%s\n' % openb)
576      self.indent += 2
577      self.PrintMessage(value)
578      self.indent -= 2
579      self.out.write(' ' * self.indent + closeb)
580
581  def PrintFieldValue(self, field, value):
582    """Print a single field value (not including name).
583
584    For repeated fields, the value should be a single element.
585
586    Args:
587      field: The descriptor of the field to be printed.
588      value: The value of the field.
589    """
590    out = self.out
591    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
592      self._PrintMessageFieldValue(value)
593    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
594      enum_value = field.enum_type.values_by_number.get(value, None)
595      if enum_value is not None:
596        out.write(enum_value.name)
597      else:
598        out.write(str(value))
599    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
600      out.write('\"')
601      if isinstance(value, str) and not self.as_utf8:
602        out_value = value.encode('utf-8')
603      else:
604        out_value = value
605      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
606        # We always need to escape all binary data in TYPE_BYTES fields.
607        out_as_utf8 = False
608      else:
609        out_as_utf8 = self.as_utf8
610      out.write(text_encoding.CEscape(out_value, out_as_utf8))
611      out.write('\"')
612    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
613      if value:
614        out.write('true')
615      else:
616        out.write('false')
617    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
618      if self.float_format is not None:
619        out.write('{1:{0}}'.format(self.float_format, value))
620      else:
621        if math.isnan(value):
622          out.write(str(value))
623        else:
624          out.write(str(type_checkers.ToShortestFloat(value)))
625    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
626          self.double_format is not None):
627      out.write('{1:{0}}'.format(self.double_format, value))
628    else:
629      out.write(str(value))
630
631
632def Parse(text,
633          message,
634          allow_unknown_extension=False,
635          allow_field_number=False,
636          descriptor_pool=None,
637          allow_unknown_field=False):
638  """Parses a text representation of a protocol message into a message.
639
640  NOTE: for historical reasons this function does not clear the input
641  message. This is different from what the binary msg.ParseFromString(...) does.
642  If text contains a field already set in message, the value is appended if the
643  field is repeated. Otherwise, an error is raised.
644
645  Example::
646
647    a = MyProto()
648    a.repeated_field.append('test')
649    b = MyProto()
650
651    # Repeated fields are combined
652    text_format.Parse(repr(a), b)
653    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]
654
655    # Non-repeated fields cannot be overwritten
656    a.singular_field = 1
657    b.singular_field = 2
658    text_format.Parse(repr(a), b) # ParseError
659
660    # Binary version:
661    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"
662
663  Caller is responsible for clearing the message as needed.
664
665  Args:
666    text (str): Message text representation.
667    message (Message): A protocol buffer message to merge into.
668    allow_unknown_extension: if True, skip over missing extensions and keep
669      parsing
670    allow_field_number: if True, both field number and field name are allowed.
671    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
672    allow_unknown_field: if True, skip over unknown fields and keep
673      parsing. Avoid using this option if possible, as it may hide some
674      errors (e.g. a spelling error in a field name).
675
676  Returns:
677    Message: The same message passed as argument.
678
679  Raises:
680    ParseError: On text parsing problems.
681  """
682  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
683                    message,
684                    allow_unknown_extension,
685                    allow_field_number,
686                    descriptor_pool=descriptor_pool,
687                    allow_unknown_field=allow_unknown_field)
688
689
690def Merge(text,
691          message,
692          allow_unknown_extension=False,
693          allow_field_number=False,
694          descriptor_pool=None,
695          allow_unknown_field=False):
696  """Parses a text representation of a protocol message into a message.
697
698  Like Parse(), but allows repeated values for a non-repeated field, and uses
699  the last one. This means any non-repeated, top-level fields specified in text
700  replace those in the message.
701
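  Example (illustrative, reusing the hypothetical MyProto from the Parse()
  docstring above)::

    a = MyProto()
    a.singular_field = 1
    b = MyProto()
    b.singular_field = 2

    # Unlike Parse(), Merge() accepts a value for a singular field that is
    # already set and keeps the last one, so b.singular_field becomes 1.
    text_format.Merge(repr(a), b)
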
702  Args:
703    text (str): Message text representation.
704    message (Message): A protocol buffer message to merge into.
705    allow_unknown_extension: if True, skip over missing extensions and keep
706      parsing
707    allow_field_number: if True, both field number and field name are allowed.
708    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
709    allow_unknown_field: if True, skip over unknown fields and keep
710      parsing. Avoid using this option if possible, as it may hide some
711      errors (e.g. a spelling error in a field name).
712
713  Returns:
714    Message: The same message passed as argument.
715
716  Raises:
717    ParseError: On text parsing problems.
718  """
719  return MergeLines(
720      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
721      message,
722      allow_unknown_extension,
723      allow_field_number,
724      descriptor_pool=descriptor_pool,
725      allow_unknown_field=allow_unknown_field)
726
727
728def ParseLines(lines,
729               message,
730               allow_unknown_extension=False,
731               allow_field_number=False,
732               descriptor_pool=None,
733               allow_unknown_field=False):
734  """Parses a text representation of a protocol message into a message.
735
736  See Parse() for caveats.
737
738  Args:
739    lines: An iterable of lines of a message's text representation.
740    message: A protocol buffer message to merge into.
741    allow_unknown_extension: if True, skip over missing extensions and keep
742      parsing
743    allow_field_number: if True, both field number and field name are allowed.
744    descriptor_pool: A DescriptorPool used to resolve Any types.
745    allow_unknown_field: if True, skip over unknown fields and keep
746      parsing. Avoid using this option if possible, as it may hide some
747      errors (e.g. a spelling error in a field name).
748
749  Returns:
750    The same message passed as argument.
751
752  Raises:
753    ParseError: On text parsing problems.
754  """
755  parser = _Parser(allow_unknown_extension,
756                   allow_field_number,
757                   descriptor_pool=descriptor_pool,
758                   allow_unknown_field=allow_unknown_field)
759  return parser.ParseLines(lines, message)
760
761
762def MergeLines(lines,
763               message,
764               allow_unknown_extension=False,
765               allow_field_number=False,
766               descriptor_pool=None,
767               allow_unknown_field=False):
768  """Parses a text representation of a protocol message into a message.
769
770  See Merge() for more details.
771
772  Args:
773    lines: An iterable of lines of a message's text representation.
774    message: A protocol buffer message to merge into.
775    allow_unknown_extension: if True, skip over missing extensions and keep
776      parsing
777    allow_field_number: if True, both field number and field name are allowed.
778    descriptor_pool: A DescriptorPool used to resolve Any types.
779    allow_unknown_field: if True, skip over unknown fields and keep
780      parsing. Avoid using this option if possible, as it may hide some
781      errors (e.g. a spelling error in a field name).
782
783  Returns:
784    The same message passed as argument.
785
786  Raises:
787    ParseError: On text parsing problems.
788  """
789  parser = _Parser(allow_unknown_extension,
790                   allow_field_number,
791                   descriptor_pool=descriptor_pool,
792                   allow_unknown_field=allow_unknown_field)
793  return parser.MergeLines(lines, message)
794
795
796class _Parser(object):
797  """Text format parser for protocol message."""
798
799  def __init__(self,
800               allow_unknown_extension=False,
801               allow_field_number=False,
802               descriptor_pool=None,
803               allow_unknown_field=False):
804    self.allow_unknown_extension = allow_unknown_extension
805    self.allow_field_number = allow_field_number
806    self.descriptor_pool = descriptor_pool
807    self.allow_unknown_field = allow_unknown_field
808
809  def ParseLines(self, lines, message):
810    """Parses a text representation of a protocol message into a message."""
811    self._allow_multiple_scalars = False
812    self._ParseOrMerge(lines, message)
813    return message
814
815  def MergeLines(self, lines, message):
816    """Merges a text representation of a protocol message into a message."""
817    self._allow_multiple_scalars = True
818    self._ParseOrMerge(lines, message)
819    return message
820
821  def _ParseOrMerge(self, lines, message):
822    """Converts a text representation of a protocol message into a message.
823
824    Args:
825      lines: Lines of a message's text representation.
826      message: A protocol buffer message to merge into.
827
828    Raises:
829      ParseError: On text parsing problems.
830    """
831    # Tokenize expects native str lines.
832    str_lines = (
833        line if isinstance(line, str) else line.decode('utf-8')
834        for line in lines)
835    tokenizer = Tokenizer(str_lines)
836    while not tokenizer.AtEnd():
837      self._MergeField(tokenizer, message)
838
839  def _MergeField(self, tokenizer, message):
840    """Merges a single protocol message field into a message.
841
842    Args:
843      tokenizer: A tokenizer to parse the field name and values.
844      message: A protocol message to record the data.
845
846    Raises:
847      ParseError: In case of text parsing problems.
848    """
849    message_descriptor = message.DESCRIPTOR
850    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
851        tokenizer.TryConsume('[')):
852      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
853      tokenizer.Consume(']')
854      tokenizer.TryConsume(':')
855      if tokenizer.TryConsume('<'):
856        expanded_any_end_token = '>'
857      else:
858        tokenizer.Consume('{')
859        expanded_any_end_token = '}'
860      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
861                                                           self.descriptor_pool)
862      if not expanded_any_sub_message:
863        raise ParseError('Type %s not found in descriptor pool' %
864                         packed_type_name)
865      while not tokenizer.TryConsume(expanded_any_end_token):
866        if tokenizer.AtEnd():
867          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
868                                                  (expanded_any_end_token,))
869        self._MergeField(tokenizer, expanded_any_sub_message)
870      deterministic = False
871
872      message.Pack(expanded_any_sub_message,
873                   type_url_prefix=type_url_prefix,
874                   deterministic=deterministic)
875      return
876
877    if tokenizer.TryConsume('['):
878      name = [tokenizer.ConsumeIdentifier()]
879      while tokenizer.TryConsume('.'):
880        name.append(tokenizer.ConsumeIdentifier())
881      name = '.'.join(name)
882
883      if not message_descriptor.is_extendable:
884        raise tokenizer.ParseErrorPreviousToken(
885            'Message type "%s" does not have extensions.' %
886            message_descriptor.full_name)
887      # pylint: disable=protected-access
888      field = message.Extensions._FindExtensionByName(name)
889      # pylint: enable=protected-access
890
891
892      if not field:
893        if self.allow_unknown_extension:
894          field = None
895        else:
896          raise tokenizer.ParseErrorPreviousToken(
897              'Extension "%s" not registered. '
898              'Did you import the _pb2 module which defines it? '
899              'If you are trying to place the extension in the MessageSet '
900              'field of another message that is in an Any or MessageSet field, '
901              'that message\'s _pb2 module must be imported as well' % name)
902      elif message_descriptor != field.containing_type:
903        raise tokenizer.ParseErrorPreviousToken(
904            'Extension "%s" does not extend message type "%s".' %
905            (name, message_descriptor.full_name))
906
907      tokenizer.Consume(']')
908
909    else:
910      name = tokenizer.ConsumeIdentifierOrNumber()
911      if self.allow_field_number and name.isdigit():
912        number = ParseInteger(name, True, True)
913        field = message_descriptor.fields_by_number.get(number, None)
914        if not field and message_descriptor.is_extendable:
915          field = message.Extensions._FindExtensionByNumber(number)
916      else:
917        field = message_descriptor.fields_by_name.get(name, None)
918
919        # Group names are expected to be capitalized as they appear in the
920        # .proto file, which actually matches their type names, not their field
921        # names.
922        if not field:
923          field = message_descriptor.fields_by_name.get(name.lower(), None)
924          if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
925            field = None
926
927        if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
928            field.message_type.name != name):
929          field = None
930
931      if not field and not self.allow_unknown_field:
932        raise tokenizer.ParseErrorPreviousToken(
933            'Message type "%s" has no field named "%s".' %
934            (message_descriptor.full_name, name))
935
936    if field:
937      if not self._allow_multiple_scalars and field.containing_oneof:
938        # Check if there's a different field set in this oneof.
939        # Note that we ignore the case if the same field was set before, and we
940        # apply _allow_multiple_scalars to non-scalar fields as well.
941        which_oneof = message.WhichOneof(field.containing_oneof.name)
942        if which_oneof is not None and which_oneof != field.name:
943          raise tokenizer.ParseErrorPreviousToken(
944              'Field "%s" is specified along with field "%s", another member '
945              'of oneof "%s" for message type "%s".' %
946              (field.name, which_oneof, field.containing_oneof.name,
947               message_descriptor.full_name))
948
949      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
950        tokenizer.TryConsume(':')
951        merger = self._MergeMessageField
952      else:
953        tokenizer.Consume(':')
954        merger = self._MergeScalarField
955
956      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
957          tokenizer.TryConsume('[')):
958        # Short repeated format, e.g. "foo: [1, 2, 3]"
959        if not tokenizer.TryConsume(']'):
960          while True:
961            merger(tokenizer, message, field)
962            if tokenizer.TryConsume(']'):
963              break
964            tokenizer.Consume(',')
965
966      else:
967        merger(tokenizer, message, field)
968
969    else:  # Proto field is unknown.
970      assert (self.allow_unknown_extension or self.allow_unknown_field)
971      _SkipFieldContents(tokenizer)
972
973    # For historical reasons, fields may optionally be separated by commas or
974    # semicolons.
975    if not tokenizer.TryConsume(','):
976      tokenizer.TryConsume(';')
977
978
979  def _ConsumeAnyTypeUrl(self, tokenizer):
980    """Consumes a google.protobuf.Any type URL and returns the type name."""
981    # Consume "type.googleapis.com/".
982    prefix = [tokenizer.ConsumeIdentifier()]
983    tokenizer.Consume('.')
984    prefix.append(tokenizer.ConsumeIdentifier())
985    tokenizer.Consume('.')
986    prefix.append(tokenizer.ConsumeIdentifier())
987    tokenizer.Consume('/')
988    # Consume the fully-qualified type name.
989    name = [tokenizer.ConsumeIdentifier()]
990    while tokenizer.TryConsume('.'):
991      name.append(tokenizer.ConsumeIdentifier())
992    return '.'.join(prefix), '.'.join(name)
993
994  def _MergeMessageField(self, tokenizer, message, field):
995    """Merges a single scalar field into a message.
996
997    Args:
998      tokenizer: A tokenizer to parse the field value.
999      message: The message of which field is a member.
1000      field: The descriptor of the field to be merged.
1001
1002    Raises:
1003      ParseError: In case of text parsing problems.
1004    """
1005    is_map_entry = _IsMapEntry(field)
1006
1007    if tokenizer.TryConsume('<'):
1008      end_token = '>'
1009    else:
1010      tokenizer.Consume('{')
1011      end_token = '}'
1012
1013    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1014      if field.is_extension:
1015        sub_message = message.Extensions[field].add()
1016      elif is_map_entry:
1017        sub_message = getattr(message, field.name).GetEntryClass()()
1018      else:
1019        sub_message = getattr(message, field.name).add()
1020    else:
1021      if field.is_extension:
1022        if (not self._allow_multiple_scalars and
1023            message.HasExtension(field)):
1024          raise tokenizer.ParseErrorPreviousToken(
1025              'Message type "%s" should not have multiple "%s" extensions.' %
1026              (message.DESCRIPTOR.full_name, field.full_name))
1027        sub_message = message.Extensions[field]
1028      else:
1029        # Also apply _allow_multiple_scalars to message field.
1030        # TODO(jieluo): Change to _allow_singular_overwrites.
1031        if (not self._allow_multiple_scalars and
1032            message.HasField(field.name)):
1033          raise tokenizer.ParseErrorPreviousToken(
1034              'Message type "%s" should not have multiple "%s" fields.' %
1035              (message.DESCRIPTOR.full_name, field.name))
1036        sub_message = getattr(message, field.name)
1037      sub_message.SetInParent()
1038
1039    while not tokenizer.TryConsume(end_token):
1040      if tokenizer.AtEnd():
1041        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
1042      self._MergeField(tokenizer, sub_message)
1043
1044    if is_map_entry:
1045      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
1046      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1047        value = getattr(message, field.name)[sub_message.key]
1048        value.CopyFrom(sub_message.value)
1049      else:
1050        getattr(message, field.name)[sub_message.key] = sub_message.value
1051
1052  @staticmethod
1053  def _IsProto3Syntax(message):
1054    message_descriptor = message.DESCRIPTOR
1055    return (hasattr(message_descriptor, 'syntax') and
1056            message_descriptor.syntax == 'proto3')
1057
1058  def _MergeScalarField(self, tokenizer, message, field):
1059    """Merges a single scalar field into a message.
1060
1061    Args:
1062      tokenizer: A tokenizer to parse the field value.
1063      message: A protocol message to record the data.
1064      field: The descriptor of the field to be merged.
1065
1066    Raises:
1067      ParseError: In case of text parsing problems.
1068      RuntimeError: On runtime errors.
1069    """
1070    _ = self.allow_unknown_extension
1071    value = None
1072
1073    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1074                      descriptor.FieldDescriptor.TYPE_SINT32,
1075                      descriptor.FieldDescriptor.TYPE_SFIXED32):
1076      value = _ConsumeInt32(tokenizer)
1077    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1078                        descriptor.FieldDescriptor.TYPE_SINT64,
1079                        descriptor.FieldDescriptor.TYPE_SFIXED64):
1080      value = _ConsumeInt64(tokenizer)
1081    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1082                        descriptor.FieldDescriptor.TYPE_FIXED32):
1083      value = _ConsumeUint32(tokenizer)
1084    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1085                        descriptor.FieldDescriptor.TYPE_FIXED64):
1086      value = _ConsumeUint64(tokenizer)
1087    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1088                        descriptor.FieldDescriptor.TYPE_DOUBLE):
1089      value = tokenizer.ConsumeFloat()
1090    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1091      value = tokenizer.ConsumeBool()
1092    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1093      value = tokenizer.ConsumeString()
1094    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1095      value = tokenizer.ConsumeByteString()
1096    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1097      value = tokenizer.ConsumeEnum(field)
1098    else:
1099      raise RuntimeError('Unknown field type %d' % field.type)
1100
1101    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1102      if field.is_extension:
1103        message.Extensions[field].append(value)
1104      else:
1105        getattr(message, field.name).append(value)
1106    else:
1107      if field.is_extension:
1108        if (not self._allow_multiple_scalars and
1109            not self._IsProto3Syntax(message) and
1110            message.HasExtension(field)):
1111          raise tokenizer.ParseErrorPreviousToken(
1112              'Message type "%s" should not have multiple "%s" extensions.' %
1113              (message.DESCRIPTOR.full_name, field.full_name))
1114        else:
1115          message.Extensions[field] = value
1116      else:
1117        duplicate_error = False
1118        if not self._allow_multiple_scalars:
1119          if self._IsProto3Syntax(message):
1120            # Proto3 doesn't represent presence, so as a best effort we check for
1121            # multiple scalars by comparing to the default values.
1122            duplicate_error = bool(getattr(message, field.name))
1123          else:
1124            duplicate_error = message.HasField(field.name)
1125
1126        if duplicate_error:
1127          raise tokenizer.ParseErrorPreviousToken(
1128              'Message type "%s" should not have multiple "%s" fields.' %
1129              (message.DESCRIPTOR.full_name, field.name))
1130        else:
1131          setattr(message, field.name, value)
1132
1133
1134def _SkipFieldContents(tokenizer):
1135  """Skips over contents (value or message) of a field.
1136
1137  Args:
1138    tokenizer: A tokenizer to parse the field name and values.
1139  """
1140  # Try to guess the type of this field.
1141  # If this field is not a message, there should be a ":" between the
1142  # field name and the field value and also the field value should not
1143  # start with "{" or "<" which indicates the beginning of a message body.
1144  # If there is no ":" or there is a "{" or "<" after ":", this field has
1145  # to be a message or the input is ill-formed.
1146  if tokenizer.TryConsume(':') and not tokenizer.LookingAt(
1147      '{') and not tokenizer.LookingAt('<'):
1148    _SkipFieldValue(tokenizer)
1149  else:
1150    _SkipFieldMessage(tokenizer)
1151
1152
1153def _SkipField(tokenizer):
1154  """Skips over a complete field (name and value/message).
1155
1156  Args:
1157    tokenizer: A tokenizer to parse the field name and values.
1158  """
1159  if tokenizer.TryConsume('['):
1160    # Consume extension name.
1161    tokenizer.ConsumeIdentifier()
1162    while tokenizer.TryConsume('.'):
1163      tokenizer.ConsumeIdentifier()
1164    tokenizer.Consume(']')
1165  else:
1166    tokenizer.ConsumeIdentifierOrNumber()
1167
1168  _SkipFieldContents(tokenizer)
1169
1170  # For historical reasons, fields may optionally be separated by commas or
1171  # semicolons.
1172  if not tokenizer.TryConsume(','):
1173    tokenizer.TryConsume(';')
1174
1175
1176def _SkipFieldMessage(tokenizer):
1177  """Skips over a field message.
1178
1179  Args:
1180    tokenizer: A tokenizer to parse the field name and values.
1181  """
1182
1183  if tokenizer.TryConsume('<'):
1184    delimiter = '>'
1185  else:
1186    tokenizer.Consume('{')
1187    delimiter = '}'
1188
1189  while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1190    _SkipField(tokenizer)
1191
1192  tokenizer.Consume(delimiter)
1193
1194
1195def _SkipFieldValue(tokenizer):
1196  """Skips over a field value.
1197
1198  Args:
1199    tokenizer: A tokenizer to parse the field name and values.
1200
1201  Raises:
1202    ParseError: In case an invalid field value is found.
1203  """
1204  # String/bytes tokens can come in multiple adjacent string literals.
1205  # If we can consume one, consume as many as we can.
1206  if tokenizer.TryConsumeByteString():
1207    while tokenizer.TryConsumeByteString():
1208      pass
1209    return
1210
1211  if (not tokenizer.TryConsumeIdentifier() and
1212      not _TryConsumeInt64(tokenizer) and not _TryConsumeUint64(tokenizer) and
1213      not tokenizer.TryConsumeFloat()):
1214    raise ParseError('Invalid field value: ' + tokenizer.token)
1215
1216
1217class Tokenizer(object):
1218  """Protocol buffer text representation tokenizer.
1219
1220  This class handles the lower level string parsing by splitting it into
1221  meaningful tokens.
1222
1223  It was directly ported from the Java protocol buffer API.
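
  For example (illustrative), tokenizing the single input line 'foo: "bar"'
  yields the tokens 'foo', ':' and '"bar"' (string tokens keep their quotes).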
1224  """
1225
1226  _WHITESPACE = re.compile(r'\s+')
1227  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1228  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1229  _TOKEN = re.compile('|'.join([
1230      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
1231      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
1232  ] + [  # quoted str for each quote mark
1233      # Avoid backtracking! https://stackoverflow.com/a/844267
1234      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1235      for mark in _QUOTES
1236  ]))
1237
1238  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1239  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1240
1241  def __init__(self, lines, skip_comments=True):
1242    self._position = 0
1243    self._line = -1
1244    self._column = 0
1245    self._token_start = None
1246    self.token = ''
1247    self._lines = iter(lines)
1248    self._current_line = ''
1249    self._previous_line = 0
1250    self._previous_column = 0
1251    self._more_lines = True
1252    self._skip_comments = skip_comments
1253    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1254                                or self._WHITESPACE)
1255    self._SkipWhitespace()
1256    self.NextToken()
1257
1258  def LookingAt(self, token):
1259    return self.token == token
1260
1261  def AtEnd(self):
1262    """Checks the end of the text was reached.
1263
1264    Returns:
1265      True iff the end was reached.
1266    """
1267    return not self.token
1268
1269  def _PopLine(self):
1270    while len(self._current_line) <= self._column:
1271      try:
1272        self._current_line = next(self._lines)
1273      except StopIteration:
1274        self._current_line = ''
1275        self._more_lines = False
1276        return
1277      else:
1278        self._line += 1
1279        self._column = 0
1280
1281  def _SkipWhitespace(self):
1282    while True:
1283      self._PopLine()
1284      match = self._whitespace_pattern.match(self._current_line, self._column)
1285      if not match:
1286        break
1287      length = len(match.group(0))
1288      self._column += length
1289
1290  def TryConsume(self, token):
1291    """Tries to consume a given piece of text.
1292
1293    Args:
1294      token: Text to consume.
1295
1296    Returns:
1297      True iff the text was consumed.
1298    """
1299    if self.token == token:
1300      self.NextToken()
1301      return True
1302    return False
1303
1304  def Consume(self, token):
1305    """Consumes a piece of text.
1306
1307    Args:
1308      token: Text to consume.
1309
1310    Raises:
1311      ParseError: If the text couldn't be consumed.
1312    """
1313    if not self.TryConsume(token):
1314      raise self.ParseError('Expected "%s".' % token)
1315
1316  def ConsumeComment(self):
1317    result = self.token
1318    if not self._COMMENT.match(result):
1319      raise self.ParseError('Expected comment.')
1320    self.NextToken()
1321    return result
1322
1323  def ConsumeCommentOrTrailingComment(self):
1324    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1325
1326    # Tokenizer initializes _previous_line and _previous_column to 0. As the
1327    # tokenizer starts, it looks like there is a previous token on the line.
1328    just_started = self._line == 0 and self._column == 0
1329
1330    before_parsing = self._previous_line
1331    comment = self.ConsumeComment()
1332
1333    # A trailing comment is a comment on the same line as the previous token.
1334    trailing = (self._previous_line == before_parsing
1335                and not just_started)
1336
1337    return trailing, comment
1338
1339  def TryConsumeIdentifier(self):
1340    try:
1341      self.ConsumeIdentifier()
1342      return True
1343    except ParseError:
1344      return False
1345
1346  def ConsumeIdentifier(self):
1347    """Consumes protocol message field identifier.
1348
1349    Returns:
1350      Identifier string.
1351
1352    Raises:
1353      ParseError: If an identifier couldn't be consumed.
1354    """
1355    result = self.token
1356    if not self._IDENTIFIER.match(result):
1357      raise self.ParseError('Expected identifier.')
1358    self.NextToken()
1359    return result
1360
1361  def TryConsumeIdentifierOrNumber(self):
1362    try:
1363      self.ConsumeIdentifierOrNumber()
1364      return True
1365    except ParseError:
1366      return False
1367
1368  def ConsumeIdentifierOrNumber(self):
1369    """Consumes protocol message field identifier.
1370
1371    Returns:
1372      Identifier string.
1373
1374    Raises:
1375      ParseError: If an identifier couldn't be consumed.
1376    """
1377    result = self.token
1378    if not self._IDENTIFIER_OR_NUMBER.match(result):
1379      raise self.ParseError('Expected identifier or number, got %s.' % result)
1380    self.NextToken()
1381    return result
1382
1383  def TryConsumeInteger(self):
1384    try:
1385      self.ConsumeInteger()
1386      return True
1387    except ParseError:
1388      return False
1389
1390  def ConsumeInteger(self):
1391    """Consumes an integer number.
1392
1393    Returns:
1394      The integer parsed.
1395
1396    Raises:
1397      ParseError: If an integer couldn't be consumed.
1398    """
1399    try:
1400      result = _ParseAbstractInteger(self.token)
1401    except ValueError as e:
1402      raise self.ParseError(str(e))
1403    self.NextToken()
1404    return result
1405
1406  def TryConsumeFloat(self):
1407    try:
1408      self.ConsumeFloat()
1409      return True
1410    except ParseError:
1411      return False
1412
1413  def ConsumeFloat(self):
1414    """Consumes an floating point number.
1415
1416    Returns:
1417      The number parsed.
1418
1419    Raises:
1420      ParseError: If a floating point number couldn't be consumed.
1421    """
1422    try:
1423      result = ParseFloat(self.token)
1424    except ValueError as e:
1425      raise self.ParseError(str(e))
1426    self.NextToken()
1427    return result
1428
1429  def ConsumeBool(self):
1430    """Consumes a boolean value.
1431
1432    Returns:
1433      The bool parsed.
1434
1435    Raises:
1436      ParseError: If a boolean value couldn't be consumed.
1437    """
1438    try:
1439      result = ParseBool(self.token)
1440    except ValueError as e:
1441      raise self.ParseError(str(e))
1442    self.NextToken()
1443    return result
1444
1445  def TryConsumeByteString(self):
1446    try:
1447      self.ConsumeByteString()
1448      return True
1449    except ParseError:
1450      return False
1451
1452  def ConsumeString(self):
1453    """Consumes a string value.
1454
1455    Returns:
1456      The string parsed.
1457
1458    Raises:
1459      ParseError: If a string value couldn't be consumed.
1460    """
1461    the_bytes = self.ConsumeByteString()
1462    try:
1463      return str(the_bytes, 'utf-8')
1464    except UnicodeDecodeError as e:
1465      raise self._StringParseError(e)
1466
1467  def ConsumeByteString(self):
1468    """Consumes a byte array value.
1469
1470    Returns:
1471      The array parsed (as bytes).
1472
1473    Raises:
1474      ParseError: If a byte array value couldn't be consumed.
1475    """
1476    the_list = [self._ConsumeSingleByteString()]
1477    while self.token and self.token[0] in _QUOTES:
1478      the_list.append(self._ConsumeSingleByteString())
1479    return b''.join(the_list)

  def _ConsumeSingleByteString(self):
    """Consume one token of a string literal.

    String literals (whether bytes or text) can come in multiple adjacent
    tokens which are automatically concatenated, like in C or Python.  This
    method only consumes one token.

    Returns:
      The token parsed.
    Raises:
      ParseError: When a malformed string literal is found.
    """
    text = self.token
    if len(text) < 1 or text[0] not in _QUOTES:
      raise self.ParseError('Expected string but found: %r' % (text,))

    if len(text) < 2 or text[-1] != text[0]:
      raise self.ParseError('String missing ending quote: %r' % (text,))

    try:
      result = text_encoding.CUnescape(text[1:-1])
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result
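
  # Example (editorial sketch, not from the original source): string literals
  # may be split across adjacent quoted tokens; ConsumeByteString() joins the
  # pieces after _ConsumeSingleByteString() unescapes each one. For input text
  # such as
  #     value: "hel" 'lo\x21'
  # ConsumeByteString() would return roughly b'hello!' (assuming CUnescape
  # expands C-style escapes such as \x21; treat this as a sketch).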

  def ConsumeEnum(self, field):
    try:
      result = ParseEnum(field, self.token)
    except ValueError as e:
      raise self.ParseError(str(e))
    self.NextToken()
    return result

  def ParseErrorPreviousToken(self, message):
    """Creates and *returns* a ParseError for the previously read token.

    Args:
      message: A message to set for the exception.

    Returns:
      A ParseError instance.
    """
    return ParseError(message, self._previous_line + 1,
                      self._previous_column + 1)

  def ParseError(self, message):
    """Creates and *returns* a ParseError for the current token."""
    return ParseError('\'' + self._current_line + '\': ' + message,
                      self._line + 1, self._column + 1)
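
  # Editorial note (not from the original source): the tokenizer tracks _line
  # and _column zero-based internally; the two ParseError factories above add
  # one so that reported positions are 1-based line/column numbers.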

  def _StringParseError(self, e):
    return self.ParseError('Couldn\'t parse string: ' + str(e))

  def NextToken(self):
    """Reads the next meaningful token."""
    self._previous_line = self._line
    self._previous_column = self._column

    self._column += len(self.token)
    self._SkipWhitespace()

    if not self._more_lines:
      self.token = ''
      return

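    # Editorial note (not from the original source): the token pattern is
    # tried first; if it fails and comment skipping is disabled
    # (self._skip_comments is False), the comment pattern is tried so that
    # comments are returned as their own tokens.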
    match = self._TOKEN.match(self._current_line, self._column)
    if not match and not self._skip_comments:
      match = self._COMMENT.match(self._current_line, self._column)
    if match:
      token = match.group(0)
      self.token = token
    else:
      self.token = self._current_line[self._column]

# Aliased so it can still be accessed by current visibility violators.
# TODO(dbarnett): Migrate violators to textformat_tokenizer.
_Tokenizer = Tokenizer  # pylint: disable=invalid-name
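
# Informal usage sketch (editorial addition, not from the original source; it
# assumes the Tokenizer(lines) constructor and the TryConsume() helper defined
# earlier in this module, which are not shown here):
#
#   tokenizer = Tokenizer(['foo: 42'])
#   tokenizer.ConsumeIdentifier()   # -> 'foo'
#   tokenizer.TryConsume(':')       # -> True
#   tokenizer.ConsumeInteger()      # -> 42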


def _ConsumeInt32(tokenizer):
  """Consumes a signed 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)


def _ConsumeUint32(tokenizer):
  """Consumes an unsigned 32bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 32bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)


def _TryConsumeInt64(tokenizer):
  try:
    _ConsumeInt64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeInt64(tokenizer):
  """Consumes a signed 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If a signed 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)


def _TryConsumeUint64(tokenizer):
  try:
    _ConsumeUint64(tokenizer)
    return True
  except ParseError:
    return False


def _ConsumeUint64(tokenizer):
  """Consumes an unsigned 64bit integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an unsigned 64bit integer couldn't be consumed.
  """
  return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)


def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
  """Consumes an integer number from tokenizer.

  Args:
    tokenizer: A tokenizer used to parse the number.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer parsed.

  Raises:
    ParseError: If an integer with given characteristics couldn't be consumed.
  """
  try:
    result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
  except ValueError as e:
    raise tokenizer.ParseError(str(e))
  tokenizer.NextToken()
  return result


def ParseInteger(text, is_signed=False, is_long=False):
  """Parses an integer.

  Args:
    text: The text to parse.
    is_signed: True if a signed integer must be parsed.
    is_long: True if a long integer must be parsed.

  Returns:
    The integer value.

  Raises:
    ValueError: If the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  result = _ParseAbstractInteger(text)

  # Check if the integer is sane. Exceptions handled by callers.
  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
  checker.CheckValue(result)
  return result
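
# Illustrative examples (editorial addition, not from the original source):
# the index 2 * is_long + is_signed selects a range checker from
# _INTEGER_CHECKERS, i.e. uint32, int32, uint64, int64 in that order, so:
#
#   ParseInteger('0x10')                      # -> 16 (checked as uint32)
#   ParseInteger('-1', is_signed=True)        # -> -1 (checked as int32)
#   ParseInteger('-1')                        # raises ValueError (uint32)
#   ParseInteger('4294967296', is_long=True)  # -> 2**32 (checked as uint64)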


def _ParseAbstractInteger(text):
  """Parses an integer without checking size/signedness.

  Args:
    text: The text to parse.

  Returns:
    The integer value.

  Raises:
    ValueError: If the text is not a valid integer.
  """
  # Do the actual parsing. Exception handling is propagated to caller.
  orig_text = text
  c_octal_match = re.match(r'(-?)0(\d+)$', text)
  if c_octal_match:
    # Python 3 no longer supports 0755 octal syntax without the 'o', so
    # we always use the '0o' prefix for multi-digit numbers starting with 0.
    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
  try:
    return int(text, 0)
  except ValueError:
    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
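
# Illustrative examples (editorial addition, not from the original source):
# int(text, 0) accepts decimal, hex, and '0o' octal literals, and the rewrite
# above maps C-style octal onto the Python 3 spelling:
#
#   _ParseAbstractInteger('0x1f')   # -> 31
#   _ParseAbstractInteger('010')    # -> 8   ('010' is rewritten to '0o10')
#   _ParseAbstractInteger('-5')     # -> -5
#   _ParseAbstractInteger('08')     # raises ValueError ('0o8' is invalid)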


def ParseFloat(text):
  """Parse a floating point number.

  Args:
    text: Text to parse.

  Returns:
    The number parsed.

  Raises:
    ValueError: If a floating point number couldn't be parsed.
  """
  try:
    # Assume Python compatible syntax.
    return float(text)
  except ValueError:
    # Check alternative spellings.
    if _FLOAT_INFINITY.match(text):
      if text[0] == '-':
        return float('-inf')
      else:
        return float('inf')
    elif _FLOAT_NAN.match(text):
      return float('nan')
    else:
      # assume '1.0f' format
      try:
        return float(text.rstrip('f'))
      except ValueError:
        raise ValueError('Couldn\'t parse float: %s' % text)
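
# Illustrative examples (editorial addition, not from the original source):
# besides Python's own float syntax, the fallbacks above accept the C/proto
# text-format spellings:
#
#   ParseFloat('1.5f')     # -> 1.5   (trailing 'f' suffix stripped)
#   ParseFloat('-inff')    # -> float('-inf')
#   ParseFloat('nanf')     # -> float('nan')
#   ParseFloat('1.2.3')    # raises ValueError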


def ParseBool(text):
  """Parse a boolean value.

  Args:
    text: Text to parse.

  Returns:
    The boolean value parsed.

  Raises:
    ValueError: If text is not a valid boolean.
  """
  if text in ('true', 't', '1', 'True'):
    return True
  elif text in ('false', 'f', '0', 'False'):
    return False
  else:
    raise ValueError('Expected "true" or "false".')
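
# Illustrative examples (editorial addition, not from the original source):
# only the exact spellings listed above are accepted:
#
#   ParseBool('t')      # -> True
#   ParseBool('0')      # -> False
#   ParseBool('TRUE')   # raises ValueError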


def ParseEnum(field, value):
  """Parse an enum value.

  The value can be specified by a number (the enum value), or by
  a string literal (the enum name).

  Args:
    field: Enum field descriptor.
    value: String value.

  Returns:
    Enum value number.

  Raises:
    ValueError: If the enum value could not be parsed.
  """
  enum_descriptor = field.enum_type
  try:
    number = int(value, 0)
  except ValueError:
    # Identifier.
    enum_value = enum_descriptor.values_by_name.get(value, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value named %s.' %
                       (enum_descriptor.full_name, value))
  else:
    # Numeric value.
    if hasattr(field.file, 'syntax'):
      # Attribute is checked for compatibility.
      if field.file.syntax == 'proto3':
        # Proto3 accepts unknown numeric enum values.
        return number
    enum_value = enum_descriptor.values_by_number.get(number, None)
    if enum_value is None:
      raise ValueError('Enum type "%s" has no value with number %d.' %
                       (enum_descriptor.full_name, number))
  return enum_value.number

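# Illustrative sketch (editorial addition; the my_pb2 module, Palette message,
# and its enum field are hypothetical names used only for this example):
#
#   field = my_pb2.Palette.DESCRIPTOR.fields_by_name['color']
#   ParseEnum(field, 'RED')    # -> the number of the RED enum entry
#   ParseEnum(field, '2')      # -> 2 (kept as-is under proto3, per the check
#                              #    above; must be a defined value otherwise)
#   ParseEnum(field, 'MAUVE')  # raises ValueError if no such name is defined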