1# Protocol Buffers - Google's data interchange format
2# Copyright 2008 Google Inc.  All rights reserved.
3#
4# Use of this source code is governed by a BSD-style
5# license that can be found in the LICENSE file or at
6# https://developers.google.com/open-source/licenses/bsd
7
8"""Contains routines for printing protocol messages in text format.
9
10Simple usage example::
11
12  # Create a proto object and serialize it to a text proto string.
13  message = my_proto_pb2.MyMessage(foo='bar')
14  text_proto = text_format.MessageToString(message)
15
16  # Parse a text proto string.
17  message = text_format.Parse(text_proto, my_proto_pb2.MyMessage())
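
  # Merge a text proto string into an existing message. Unlike Parse(),
  # Merge() overwrites singular fields that are already set.
  text_format.Merge(text_proto, message)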
18"""
19
20__author__ = 'kenton@google.com (Kenton Varda)'
21
22# TODO Import thread contention leads to test failures.
23import encodings.raw_unicode_escape  # pylint: disable=unused-import
24import encodings.unicode_escape  # pylint: disable=unused-import
25import io
26import math
27import re
28
29from google.protobuf.internal import decoder
30from google.protobuf.internal import type_checkers
31from google.protobuf import descriptor
32from google.protobuf import text_encoding
33from google.protobuf import unknown_fields
34
35# pylint: disable=g-import-not-at-top
36__all__ = ['MessageToString', 'Parse', 'PrintMessage', 'PrintField',
37           'PrintFieldValue', 'Merge', 'MessageToBytes']
38
39_INTEGER_CHECKERS = (type_checkers.Uint32ValueChecker(),
40                     type_checkers.Int32ValueChecker(),
41                     type_checkers.Uint64ValueChecker(),
42                     type_checkers.Int64ValueChecker())
43_FLOAT_INFINITY = re.compile('-?inf(?:inity)?f?$', re.IGNORECASE)
44_FLOAT_NAN = re.compile('nanf?$', re.IGNORECASE)
45_QUOTES = frozenset(("'", '"'))
46_ANY_FULL_TYPE_NAME = 'google.protobuf.Any'
47_DEBUG_STRING_SILENT_MARKER = '\t '
48
49_as_utf8_default = True
50
51
52class Error(Exception):
53  """Top-level module error for text_format."""
54
55
56class ParseError(Error):
57  """Thrown in case of text parsing or tokenizing error."""
58
59  def __init__(self, message=None, line=None, column=None):
60    if message is not None and line is not None:
61      loc = str(line)
62      if column is not None:
63        loc += ':{0}'.format(column)
64      message = '{0} : {1}'.format(loc, message)
65    if message is not None:
66      super(ParseError, self).__init__(message)
67    else:
68      super(ParseError, self).__init__()
69    self._line = line
70    self._column = column
71
72  def GetLine(self):
73    return self._line
74
75  def GetColumn(self):
76    return self._column
77
78
79class TextWriter(object):
80
81  def __init__(self, as_utf8):
82    self._writer = io.StringIO()
83
84  def write(self, val):
85    return self._writer.write(val)
86
87  def close(self):
88    return self._writer.close()
89
90  def getvalue(self):
91    return self._writer.getvalue()
92
93
94def MessageToString(
95    message,
96    as_utf8=_as_utf8_default,
97    as_one_line=False,
98    use_short_repeated_primitives=False,
99    pointy_brackets=False,
100    use_index_order=False,
101    float_format=None,
102    double_format=None,
103    use_field_number=False,
104    descriptor_pool=None,
105    indent=0,
106    message_formatter=None,
107    print_unknown_fields=False,
108    force_colon=False) -> str:
109  """Convert protobuf message to text format.
110
111  Double values can be formatted compactly with 15 digits of
112  precision (which is the most that IEEE 754 "double" can guarantee)
113  using double_format='.15g'. To ensure that converting to text and back to a
114  proto will result in an identical value, double_format='.17g' should be used.
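
  Example::

    # A minimal round-trip sketch. ``my_proto_pb2.MyMessage`` and its double
    # field ``price`` are hypothetical placeholders.
    message = my_proto_pb2.MyMessage(price=2.5)
    # double_format='.17g' keeps enough digits that parsing the text back
    # yields an identical double value.
    text_proto = text_format.MessageToString(message, double_format='.17g')
    restored = text_format.Parse(text_proto, my_proto_pb2.MyMessage())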
115
116  Args:
117    message: The protocol buffers message.
118    as_utf8: Return unescaped Unicode for non-ASCII characters.
119    as_one_line: Don't introduce newlines between fields.
120    use_short_repeated_primitives: Use short repeated format for primitives.
121    pointy_brackets: If True, use angle brackets instead of curly braces for
122      nesting.
123    use_index_order: If True, fields of a proto message will be printed using
124      the order defined in source code instead of the field number, extensions
125      will be printed at the end of the message and their relative order is
126      determined by the extension number. By default, use the field number
127      order.
128    float_format (str): If set, use this to specify float field formatting
129      (per the "Format Specification Mini-Language"); otherwise, the shortest
130      float that has the same value on the wire will be printed. This also
131      affects double fields if double_format is not set but float_format is.
132    double_format (str): If set, use this to specify double field formatting
133      (per the "Format Specification Mini-Language"); if it is not set but
134      float_format is set, use float_format. Otherwise, use ``str()``.
135    use_field_number: If True, print field numbers instead of names.
136    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
137    indent (int): The initial indent level, in terms of spaces, for pretty
138      print.
139    message_formatter (function(message, indent, as_one_line) -> unicode|None):
140      Custom formatter for selected sub-messages (usually based on message
141      type). Use to pretty print parts of the protobuf for easier diffing.
142    print_unknown_fields: If True, unknown fields will be printed.
143    force_colon: If set, a colon will be added after the field name even if the
144      field is a proto message.
145
146  Returns:
147    str: A string of the text formatted protocol buffer message.
148  """
149  out = TextWriter(as_utf8)
150  printer = _Printer(
151      out,
152      indent,
153      as_utf8,
154      as_one_line,
155      use_short_repeated_primitives,
156      pointy_brackets,
157      use_index_order,
158      float_format,
159      double_format,
160      use_field_number,
161      descriptor_pool,
162      message_formatter,
163      print_unknown_fields=print_unknown_fields,
164      force_colon=force_colon)
165  printer.PrintMessage(message)
166  result = out.getvalue()
167  out.close()
168  if as_one_line:
169    return result.rstrip()
170  return result
171
172
173def MessageToBytes(message, **kwargs) -> bytes:
174  """Convert protobuf message to encoded text format.  See MessageToString."""
175  text = MessageToString(message, **kwargs)
176  if isinstance(text, bytes):
177    return text
178  codec = 'utf-8' if kwargs.get('as_utf8') else 'ascii'
179  return text.encode(codec)
180
181
182def _IsMapEntry(field):
183  return (field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
184          field.message_type.has_options and
185          field.message_type.GetOptions().map_entry)
186
187
188def _IsGroupLike(field):
189  """Determines if a field is consistent with a proto2 group.
190
191  Args:
192    field: The field descriptor.
193
194  Returns:
195    True if this field is group-like, false otherwise.
196  """
197  # Groups are always tag-delimited.
198  if field.type != descriptor.FieldDescriptor.TYPE_GROUP:
199    return False
200
201  # Group field names are always the lowercase form of the type name.
202  if field.name != field.message_type.name.lower():
203    return False
204
205  if field.message_type.file != field.file:
206    return False
207
208  # Group messages are always defined in the same scope as the field.  File
209  # level extensions will compare None == None here, which is why the file
210  # comparison above is necessary to ensure both come from the same file.
211  return (
212      field.message_type.containing_type == field.extension_scope
213      if field.is_extension
214      else field.message_type.containing_type == field.containing_type
215  )
216
217
218def PrintMessage(message,
219                 out,
220                 indent=0,
221                 as_utf8=_as_utf8_default,
222                 as_one_line=False,
223                 use_short_repeated_primitives=False,
224                 pointy_brackets=False,
225                 use_index_order=False,
226                 float_format=None,
227                 double_format=None,
228                 use_field_number=False,
229                 descriptor_pool=None,
230                 message_formatter=None,
231                 print_unknown_fields=False,
232                 force_colon=False):
233  """Convert the message to text format and write it to the out stream.
234
235  Args:
236    message: The Message object to convert to text format.
237    out: A file handle to write the message to.
238    indent: The initial indent level for pretty print.
239    as_utf8: Return unescaped Unicode for non-ASCII characters.
240    as_one_line: Don't introduce newlines between fields.
241    use_short_repeated_primitives: Use short repeated format for primitives.
242    pointy_brackets: If True, use angle brackets instead of curly braces for
243      nesting.
244    use_index_order: If True, print fields of a proto message using the order
245      defined in source code instead of the field number. By default, use the
246      field number order.
247    float_format: If set, use this to specify float field formatting (per
248      the "Format Specification Mini-Language"); otherwise, the shortest
249      float that has the same value on the wire will be printed. This also
250      affects double fields if double_format is not set but float_format is.
251    double_format: If set, use this to specify double field formatting
252      (per the "Format Specification Mini-Language"); if it is not set but
253      float_format is set, use float_format. Otherwise, str() is used.
254    use_field_number: If True, print field numbers instead of names.
255    descriptor_pool: A DescriptorPool used to resolve Any types.
256    message_formatter: A function(message, indent, as_one_line): unicode|None
257      to custom format selected sub-messages (usually based on message type).
258      Use to pretty print parts of the protobuf for easier diffing.
259    print_unknown_fields: If True, unknown fields will be printed.
260    force_colon: If set, a colon will be added after the field name even if
261      the field is a proto message.
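
  Example::

    # A minimal sketch: render a message into an in-memory text stream.
    # ``my_proto_pb2.MyMessage`` and its field ``foo`` are hypothetical.
    out = io.StringIO()
    text_format.PrintMessage(my_proto_pb2.MyMessage(foo='bar'), out)
    text_proto = out.getvalue()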
262  """
263  printer = _Printer(
264      out=out, indent=indent, as_utf8=as_utf8,
265      as_one_line=as_one_line,
266      use_short_repeated_primitives=use_short_repeated_primitives,
267      pointy_brackets=pointy_brackets,
268      use_index_order=use_index_order,
269      float_format=float_format,
270      double_format=double_format,
271      use_field_number=use_field_number,
272      descriptor_pool=descriptor_pool,
273      message_formatter=message_formatter,
274      print_unknown_fields=print_unknown_fields,
275      force_colon=force_colon)
276  printer.PrintMessage(message)
277
278
279def PrintField(field,
280               value,
281               out,
282               indent=0,
283               as_utf8=_as_utf8_default,
284               as_one_line=False,
285               use_short_repeated_primitives=False,
286               pointy_brackets=False,
287               use_index_order=False,
288               float_format=None,
289               double_format=None,
290               message_formatter=None,
291               print_unknown_fields=False,
292               force_colon=False):
293  """Print a single field name/value pair."""
294  printer = _Printer(out, indent, as_utf8, as_one_line,
295                     use_short_repeated_primitives, pointy_brackets,
296                     use_index_order, float_format, double_format,
297                     message_formatter=message_formatter,
298                     print_unknown_fields=print_unknown_fields,
299                     force_colon=force_colon)
300  printer.PrintField(field, value)
301
302
303def PrintFieldValue(field,
304                    value,
305                    out,
306                    indent=0,
307                    as_utf8=_as_utf8_default,
308                    as_one_line=False,
309                    use_short_repeated_primitives=False,
310                    pointy_brackets=False,
311                    use_index_order=False,
312                    float_format=None,
313                    double_format=None,
314                    message_formatter=None,
315                    print_unknown_fields=False,
316                    force_colon=False):
317  """Print a single field value (not including name)."""
318  printer = _Printer(out, indent, as_utf8, as_one_line,
319                     use_short_repeated_primitives, pointy_brackets,
320                     use_index_order, float_format, double_format,
321                     message_formatter=message_formatter,
322                     print_unknown_fields=print_unknown_fields,
323                     force_colon=force_colon)
324  printer.PrintFieldValue(field, value)
325
326
327def _BuildMessageFromTypeName(type_name, descriptor_pool):
328  """Returns a protobuf message instance.
329
330  Args:
331    type_name: Fully-qualified protobuf message type name string.
332    descriptor_pool: DescriptorPool instance.
333
334  Returns:
335    A Message instance of type matching type_name, or None if no Descriptor
336    matching type_name was found.
337  """
338  # pylint: disable=g-import-not-at-top
339  if descriptor_pool is None:
340    from google.protobuf import descriptor_pool as pool_mod
341    descriptor_pool = pool_mod.Default()
342  from google.protobuf import message_factory
343  try:
344    message_descriptor = descriptor_pool.FindMessageTypeByName(type_name)
345  except KeyError:
346    return None
347  message_type = message_factory.GetMessageClass(message_descriptor)
348  return message_type()
349
350
351# These values must match WireType enum in //google/protobuf/wire_format.h.
352WIRETYPE_LENGTH_DELIMITED = 2
353WIRETYPE_START_GROUP = 3
354
355
356class _Printer(object):
357  """Text format printer for protocol message."""
358
359  def __init__(
360      self,
361      out,
362      indent=0,
363      as_utf8=_as_utf8_default,
364      as_one_line=False,
365      use_short_repeated_primitives=False,
366      pointy_brackets=False,
367      use_index_order=False,
368      float_format=None,
369      double_format=None,
370      use_field_number=False,
371      descriptor_pool=None,
372      message_formatter=None,
373      print_unknown_fields=False,
374      force_colon=False):
375    """Initialize the Printer.
376
377    Double values can be formatted compactly with 15 digits of precision
378    (which is the most that IEEE 754 "double" can guarantee) using
379    double_format='.15g'. To ensure that converting to text and back to a proto
380    will result in an identical value, double_format='.17g' should be used.
381
382    Args:
383      out: To record the text format result.
384      indent: The initial indent level for pretty print.
385      as_utf8: Return unescaped Unicode for non-ASCII characters.
386      as_one_line: Don't introduce newlines between fields.
387      use_short_repeated_primitives: Use short repeated format for primitives.
388      pointy_brackets: If True, use angle brackets instead of curly braces for
389        nesting.
390      use_index_order: If True, print fields of a proto message using the order
391        defined in source code instead of the field number. By default, use the
392        field number order.
393      float_format: If set, use this to specify float field formatting (per
394        the "Format Specification Mini-Language"); otherwise, the shortest
395        float that has the same value on the wire will be printed. This also
396        affects double fields if double_format is not set but float_format is.
397      double_format: If set, use this to specify double field formatting
398        (per the "Format Specification Mini-Language"); if it is not set but
399        float_format is set, use float_format. Otherwise, str() is used.
400      use_field_number: If True, print field numbers instead of names.
401      descriptor_pool: A DescriptorPool used to resolve Any types.
402      message_formatter: A function(message, indent, as_one_line): unicode|None
403        to custom format selected sub-messages (usually based on message type).
404        Use to pretty print parts of the protobuf for easier diffing.
405      print_unknown_fields: If True, unknown fields will be printed.
406      force_colon: If set, a colon will be added after the field name even if
407        the field is a proto message.
408    """
409    self.out = out
410    self.indent = indent
411    self.as_utf8 = as_utf8
412    self.as_one_line = as_one_line
413    self.use_short_repeated_primitives = use_short_repeated_primitives
414    self.pointy_brackets = pointy_brackets
415    self.use_index_order = use_index_order
416    self.float_format = float_format
417    if double_format is not None:
418      self.double_format = double_format
419    else:
420      self.double_format = float_format
421    self.use_field_number = use_field_number
422    self.descriptor_pool = descriptor_pool
423    self.message_formatter = message_formatter
424    self.print_unknown_fields = print_unknown_fields
425    self.force_colon = force_colon
426
427  def _TryPrintAsAnyMessage(self, message):
428    """Serializes if message is a google.protobuf.Any field."""
429    if '/' not in message.type_url:
430      return False
431    packed_message = _BuildMessageFromTypeName(message.TypeName(),
432                                               self.descriptor_pool)
433    if packed_message:
434      packed_message.MergeFromString(message.value)
435      colon = ':' if self.force_colon else ''
436      self.out.write('%s[%s]%s ' % (self.indent * ' ', message.type_url, colon))
437      self._PrintMessageFieldValue(packed_message)
438      self.out.write(' ' if self.as_one_line else '\n')
439      return True
440    else:
441      return False
442
443  def _TryCustomFormatMessage(self, message):
444    formatted = self.message_formatter(message, self.indent, self.as_one_line)
445    if formatted is None:
446      return False
447
448    out = self.out
449    out.write(' ' * self.indent)
450    out.write(formatted)
451    out.write(' ' if self.as_one_line else '\n')
452    return True
453
454  def PrintMessage(self, message):
455    """Convert protobuf message to text format.
456
457    Args:
458      message: The protocol buffers message.
459    """
460    if self.message_formatter and self._TryCustomFormatMessage(message):
461      return
462    if (message.DESCRIPTOR.full_name == _ANY_FULL_TYPE_NAME and
463        self._TryPrintAsAnyMessage(message)):
464      return
465    fields = message.ListFields()
466    if self.use_index_order:
467      fields.sort(
468          key=lambda x: x[0].number if x[0].is_extension else x[0].index)
469    for field, value in fields:
470      if _IsMapEntry(field):
471        for key in sorted(value):
472          # This is slow for maps with submessage entries because it copies the
473          # entire tree.  Unfortunately this would take significant refactoring
474          # of this file to work around.
475          #
476          # TODO: refactor and optimize if this becomes an issue.
477          entry_submsg = value.GetEntryClass()(key=key, value=value[key])
478          self.PrintField(field, entry_submsg)
479      elif field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
480        if (self.use_short_repeated_primitives
481            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE
482            and field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_STRING):
483          self._PrintShortRepeatedPrimitivesValue(field, value)
484        else:
485          for element in value:
486            self.PrintField(field, element)
487      else:
488        self.PrintField(field, value)
489
490    if self.print_unknown_fields:
491      self._PrintUnknownFields(unknown_fields.UnknownFieldSet(message))
492
493  def _PrintUnknownFields(self, unknown_field_set):
494    """Print unknown fields."""
495    out = self.out
496    for field in unknown_field_set:
497      out.write(' ' * self.indent)
498      out.write(str(field.field_number))
499      if field.wire_type == WIRETYPE_START_GROUP:
500        if self.as_one_line:
501          out.write(' { ')
502        else:
503          out.write(' {\n')
504          self.indent += 2
505
506        self._PrintUnknownFields(field.data)
507
508        if self.as_one_line:
509          out.write('} ')
510        else:
511          self.indent -= 2
512          out.write(' ' * self.indent + '}\n')
513      elif field.wire_type == WIRETYPE_LENGTH_DELIMITED:
514        try:
515          # If this field is parseable as a Message, it is probably
516          # an embedded message.
517          # pylint: disable=protected-access
518          (embedded_unknown_message, pos) = decoder._DecodeUnknownFieldSet(
519              memoryview(field.data), 0, len(field.data))
520        except Exception:    # pylint: disable=broad-except
521          pos = 0
522
523        if pos == len(field.data):
524          if self.as_one_line:
525            out.write(' { ')
526          else:
527            out.write(' {\n')
528            self.indent += 2
529
530          self._PrintUnknownFields(embedded_unknown_message)
531
532          if self.as_one_line:
533            out.write('} ')
534          else:
535            self.indent -= 2
536            out.write(' ' * self.indent + '}\n')
537        else:
538          # A string or bytes field. self.as_utf8 may not work.
539          out.write(': \"')
540          out.write(text_encoding.CEscape(field.data, False))
541          out.write('\" ' if self.as_one_line else '\"\n')
542      else:
543        # varint, fixed32, fixed64
544        out.write(': ')
545        out.write(str(field.data))
546        out.write(' ' if self.as_one_line else '\n')
547
548  def _PrintFieldName(self, field):
549    """Print field name."""
550    out = self.out
551    out.write(' ' * self.indent)
552    if self.use_field_number:
553      out.write(str(field.number))
554    else:
555      if field.is_extension:
556        out.write('[')
557        if (field.containing_type.GetOptions().message_set_wire_format and
558            field.type == descriptor.FieldDescriptor.TYPE_MESSAGE and
559            field.label == descriptor.FieldDescriptor.LABEL_OPTIONAL):
560          out.write(field.message_type.full_name)
561        else:
562          out.write(field.full_name)
563        out.write(']')
564      elif _IsGroupLike(field):
565        # For groups, use the capitalized name.
566        out.write(field.message_type.name)
567      else:
568        out.write(field.name)
569
570    if (self.force_colon or
571        field.cpp_type != descriptor.FieldDescriptor.CPPTYPE_MESSAGE):
572      # The colon is optional in this case, but our cross-language golden files
573      # don't include it. Here, the colon is only included if force_colon is
574      # set to True.
575      out.write(':')
576
577  def PrintField(self, field, value):
578    """Print a single field name/value pair."""
579    self._PrintFieldName(field)
580    self.out.write(' ')
581    self.PrintFieldValue(field, value)
582    self.out.write(' ' if self.as_one_line else '\n')
583
584  def _PrintShortRepeatedPrimitivesValue(self, field, value):
585    """"Prints short repeated primitives value."""
586    # Note: this is called only when value has at least one element.
587    self._PrintFieldName(field)
588    self.out.write(' [')
589    for i in range(len(value) - 1):
590      self.PrintFieldValue(field, value[i])
591      self.out.write(', ')
592    self.PrintFieldValue(field, value[-1])
593    self.out.write(']')
594    self.out.write(' ' if self.as_one_line else '\n')
595
596  def _PrintMessageFieldValue(self, value):
597    if self.pointy_brackets:
598      openb = '<'
599      closeb = '>'
600    else:
601      openb = '{'
602      closeb = '}'
603
604    if self.as_one_line:
605      self.out.write('%s ' % openb)
606      self.PrintMessage(value)
607      self.out.write(closeb)
608    else:
609      self.out.write('%s\n' % openb)
610      self.indent += 2
611      self.PrintMessage(value)
612      self.indent -= 2
613      self.out.write(' ' * self.indent + closeb)
614
615  def PrintFieldValue(self, field, value):
616    """Print a single field value (not including name).
617
618    For repeated fields, the value should be a single element.
619
620    Args:
621      field: The descriptor of the field to be printed.
622      value: The value of the field.
623    """
624    out = self.out
625    if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
626      self._PrintMessageFieldValue(value)
627    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_ENUM:
628      enum_value = field.enum_type.values_by_number.get(value, None)
629      if enum_value is not None:
630        out.write(enum_value.name)
631      else:
632        out.write(str(value))
633    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_STRING:
634      out.write('\"')
635      if isinstance(value, str) and not self.as_utf8:
636        out_value = value.encode('utf-8')
637      else:
638        out_value = value
639      if field.type == descriptor.FieldDescriptor.TYPE_BYTES:
640        # We always need to escape all binary data in TYPE_BYTES fields.
641        out_as_utf8 = False
642      else:
643        out_as_utf8 = self.as_utf8
644      out.write(text_encoding.CEscape(out_value, out_as_utf8))
645      out.write('\"')
646    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_BOOL:
647      if value:
648        out.write('true')
649      else:
650        out.write('false')
651    elif field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_FLOAT:
652      if self.float_format is not None:
653        out.write('{1:{0}}'.format(self.float_format, value))
654      else:
655        if math.isnan(value):
656          out.write(str(value))
657        else:
658          out.write(str(type_checkers.ToShortestFloat(value)))
659    elif (field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_DOUBLE and
660          self.double_format is not None):
661      out.write('{1:{0}}'.format(self.double_format, value))
662    else:
663      out.write(str(value))
664
665
666def Parse(text,
667          message,
668          allow_unknown_extension=False,
669          allow_field_number=False,
670          descriptor_pool=None,
671          allow_unknown_field=False):
672  """Parses a text representation of a protocol message into a message.
673
674  NOTE: for historical reasons this function does not clear the input
675  message. This is different from what binary msg.ParseFromString(...) does.
676  If text contains a field already set in message, the value is appended if the
677  field is repeated. Otherwise, an error is raised.
678
679  Example::
680
681    a = MyProto()
682    a.repeated_field.append('test')
683    b = MyProto()
684
685    # Repeated fields are combined
686    text_format.Parse(repr(a), b)
687    text_format.Parse(repr(a), b) # repeated_field contains ["test", "test"]
688
689    # Non-repeated fields cannot be overwritten
690    a.singular_field = 1
691    b.singular_field = 2
692    text_format.Parse(repr(a), b) # ParseError
693
694    # Binary version:
695    b.ParseFromString(a.SerializeToString()) # repeated_field is now "test"
696
697  Caller is responsible for clearing the message as needed.
698
699  Args:
700    text (str): Message text representation.
701    message (Message): A protocol buffer message to merge into.
702    allow_unknown_extension: if True, skip over missing extensions and keep
703      parsing.
704    allow_field_number: if True, both field number and field name are allowed.
705    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
706    allow_unknown_field: if True, skip over unknown fields and keep parsing.
707      Avoid using this option if possible; it may hide errors (e.g. a
708      spelling error in a field name).
709
710  Returns:
711    Message: The same message passed as argument.
712
713  Raises:
714    ParseError: On text parsing problems.
715  """
716  return ParseLines(text.split(b'\n' if isinstance(text, bytes) else u'\n'),
717                    message,
718                    allow_unknown_extension,
719                    allow_field_number,
720                    descriptor_pool=descriptor_pool,
721                    allow_unknown_field=allow_unknown_field)
722
723
724def Merge(text,
725          message,
726          allow_unknown_extension=False,
727          allow_field_number=False,
728          descriptor_pool=None,
729          allow_unknown_field=False):
730  """Parses a text representation of a protocol message into a message.
731
732  Like Parse(), but allows repeated values for a non-repeated field, and uses
733  the last one. This means any non-repeated, top-level fields specified in text
734  replace those in the message.
735
736  Args:
737    text (str): Message text representation.
738    message (Message): A protocol buffer message to merge into.
739    allow_unknown_extension: if True, skip over missing extensions and keep
740      parsing.
741    allow_field_number: if True, both field number and field name are allowed.
742    descriptor_pool (DescriptorPool): Descriptor pool used to resolve Any types.
743    allow_unknown_field: if True, skip over unknown fields and keep parsing.
744      Avoid using this option if possible; it may hide errors (e.g. a
745      spelling error in a field name).
746
747  Returns:
748    Message: The same message passed as argument.
749
750  Raises:
751    ParseError: On text parsing problems.
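
  Example::

    # A minimal sketch. Unlike Parse(), Merge() overwrites a singular field
    # that is already set. ``MyProto`` is the same hypothetical message type
    # used in the Parse() example.
    a = MyProto()
    a.singular_field = 1
    b = MyProto()
    b.singular_field = 2
    text_format.Merge(repr(a), b)  # No error; b.singular_field is now 1.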
752  """
753  return MergeLines(
754      text.split(b'\n' if isinstance(text, bytes) else u'\n'),
755      message,
756      allow_unknown_extension,
757      allow_field_number,
758      descriptor_pool=descriptor_pool,
759      allow_unknown_field=allow_unknown_field)
760
761
762def ParseLines(lines,
763               message,
764               allow_unknown_extension=False,
765               allow_field_number=False,
766               descriptor_pool=None,
767               allow_unknown_field=False):
768  """Parses a text representation of a protocol message into a message.
769
770  See Parse() for caveats.
771
772  Args:
773    lines: An iterable of lines of a message's text representation.
774    message: A protocol buffer message to merge into.
775    allow_unknown_extension: if True, skip over missing extensions and keep
776      parsing.
777    allow_field_number: if True, both field number and field name are allowed.
778    descriptor_pool: A DescriptorPool used to resolve Any types.
779    allow_unknown_field: if True, skip over unknown fields and keep parsing.
780      Avoid using this option if possible; it may hide errors (e.g. a
781      spelling error in a field name).
782
783  Returns:
784    The same message passed as argument.
785
786  Raises:
787    ParseError: On text parsing problems.
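
  Example::

    # A minimal sketch: any iterable of lines works, including an open file
    # object. The path and message type below are hypothetical.
    with open('message.txtpb') as f:
      message = text_format.ParseLines(f, my_proto_pb2.MyMessage())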
788  """
789  parser = _Parser(allow_unknown_extension,
790                   allow_field_number,
791                   descriptor_pool=descriptor_pool,
792                   allow_unknown_field=allow_unknown_field)
793  return parser.ParseLines(lines, message)
794
795
796def MergeLines(lines,
797               message,
798               allow_unknown_extension=False,
799               allow_field_number=False,
800               descriptor_pool=None,
801               allow_unknown_field=False):
802  """Parses a text representation of a protocol message into a message.
803
804  See Merge() for more details.
805
806  Args:
807    lines: An iterable of lines of a message's text representation.
808    message: A protocol buffer message to merge into.
809    allow_unknown_extension: if True, skip over missing extensions and keep
810      parsing.
811    allow_field_number: if True, both field number and field name are allowed.
812    descriptor_pool: A DescriptorPool used to resolve Any types.
813    allow_unknown_field: if True, skip over unknown fields and keep parsing.
814      Avoid using this option if possible; it may hide errors (e.g. a
815      spelling error in a field name).
816
817  Returns:
818    The same message passed as argument.
819
820  Raises:
821    ParseError: On text parsing problems.
822  """
823  parser = _Parser(allow_unknown_extension,
824                   allow_field_number,
825                   descriptor_pool=descriptor_pool,
826                   allow_unknown_field=allow_unknown_field)
827  return parser.MergeLines(lines, message)
828
829
830class _Parser(object):
831  """Text format parser for protocol message."""
832
833  def __init__(self,
834               allow_unknown_extension=False,
835               allow_field_number=False,
836               descriptor_pool=None,
837               allow_unknown_field=False):
838    self.allow_unknown_extension = allow_unknown_extension
839    self.allow_field_number = allow_field_number
840    self.descriptor_pool = descriptor_pool
841    self.allow_unknown_field = allow_unknown_field
842
843  def ParseLines(self, lines, message):
844    """Parses a text representation of a protocol message into a message."""
845    self._allow_multiple_scalars = False
846    self._ParseOrMerge(lines, message)
847    return message
848
849  def MergeLines(self, lines, message):
850    """Merges a text representation of a protocol message into a message."""
851    self._allow_multiple_scalars = True
852    self._ParseOrMerge(lines, message)
853    return message
854
855  def _ParseOrMerge(self, lines, message):
856    """Converts a text representation of a protocol message into a message.
857
858    Args:
859      lines: Lines of a message's text representation.
860      message: A protocol buffer message to merge into.
861
862    Raises:
863      ParseError: On text parsing problems.
864    """
865    # Tokenizer expects native str lines.
866    try:
867      str_lines = (
868          line if isinstance(line, str) else line.decode('utf-8')
869          for line in lines)
870      tokenizer = Tokenizer(str_lines)
871    except UnicodeDecodeError as e:
872      raise ParseError from e
873    if message:
874      self.root_type = message.DESCRIPTOR.full_name
875    while not tokenizer.AtEnd():
876      self._MergeField(tokenizer, message)
877
878  def _MergeField(self, tokenizer, message):
879    """Merges a single protocol message field into a message.
880
881    Args:
882      tokenizer: A tokenizer to parse the field name and values.
883      message: A protocol message to record the data.
884
885    Raises:
886      ParseError: In case of text parsing problems.
887    """
888    message_descriptor = message.DESCRIPTOR
889    if (message_descriptor.full_name == _ANY_FULL_TYPE_NAME and
890        tokenizer.TryConsume('[')):
891      type_url_prefix, packed_type_name = self._ConsumeAnyTypeUrl(tokenizer)
892      tokenizer.Consume(']')
893      tokenizer.TryConsume(':')
894      self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
895                               type_url_prefix + '/' + packed_type_name)
896      if tokenizer.TryConsume('<'):
897        expanded_any_end_token = '>'
898      else:
899        tokenizer.Consume('{')
900        expanded_any_end_token = '}'
901      expanded_any_sub_message = _BuildMessageFromTypeName(packed_type_name,
902                                                           self.descriptor_pool)
903      # Direct comparison with None is used instead of implicit bool conversion
904      # to avoid false positives with falsy initial values, e.g. for
905      # google.protobuf.ListValue.
906      if expanded_any_sub_message is None:
907        raise ParseError('Type %s not found in descriptor pool' %
908                         packed_type_name)
909      while not tokenizer.TryConsume(expanded_any_end_token):
910        if tokenizer.AtEnd():
911          raise tokenizer.ParseErrorPreviousToken('Expected "%s".' %
912                                                  (expanded_any_end_token,))
913        self._MergeField(tokenizer, expanded_any_sub_message)
914      deterministic = False
915
916      message.Pack(expanded_any_sub_message,
917                   type_url_prefix=type_url_prefix,
918                   deterministic=deterministic)
919      return
920
921    if tokenizer.TryConsume('['):
922      name = [tokenizer.ConsumeIdentifier()]
923      while tokenizer.TryConsume('.'):
924        name.append(tokenizer.ConsumeIdentifier())
925      name = '.'.join(name)
926
927      if not message_descriptor.is_extendable:
928        raise tokenizer.ParseErrorPreviousToken(
929            'Message type "%s" does not have extensions.' %
930            message_descriptor.full_name)
931      # pylint: disable=protected-access
932      field = message.Extensions._FindExtensionByName(name)
933      # pylint: enable=protected-access
934      if not field:
935        if self.allow_unknown_extension:
936          field = None
937        else:
938          raise tokenizer.ParseErrorPreviousToken(
939              'Extension "%s" not registered. '
940              'Did you import the _pb2 module which defines it? '
941              'If you are trying to place the extension in the MessageSet '
942              'field of another message that is in an Any or MessageSet field, '
943              'that message\'s _pb2 module must be imported as well' % name)
944      elif message_descriptor != field.containing_type:
945        raise tokenizer.ParseErrorPreviousToken(
946            'Extension "%s" does not extend message type "%s".' %
947            (name, message_descriptor.full_name))
948
949      tokenizer.Consume(']')
950
951    else:
952      name = tokenizer.ConsumeIdentifierOrNumber()
953      if self.allow_field_number and name.isdigit():
954        number = ParseInteger(name, True, True)
955        field = message_descriptor.fields_by_number.get(number, None)
956        if not field and message_descriptor.is_extendable:
957          field = message.Extensions._FindExtensionByNumber(number)
958      else:
959        field = message_descriptor.fields_by_name.get(name, None)
960
961        # Group names are expected to be capitalized as they appear in the
962        # .proto file, which actually matches their type names, not their field
963        # names.
964        if not field:
965          field = message_descriptor.fields_by_name.get(name.lower(), None)
966          if field and not _IsGroupLike(field):
967            field = None
968          if field and field.message_type.name != name:
969            field = None
970
971      if not field and not self.allow_unknown_field:
972        raise tokenizer.ParseErrorPreviousToken(
973            'Message type "%s" has no field named "%s".' %
974            (message_descriptor.full_name, name))
975
976    if field:
977      if not self._allow_multiple_scalars and field.containing_oneof:
978        # Check if there's a different field set in this oneof.
979        # Note that we ignore the case if the same field was set before, and we
980        # apply _allow_multiple_scalars to non-scalar fields as well.
981        which_oneof = message.WhichOneof(field.containing_oneof.name)
982        if which_oneof is not None and which_oneof != field.name:
983          raise tokenizer.ParseErrorPreviousToken(
984              'Field "%s" is specified along with field "%s", another member '
985              'of oneof "%s" for message type "%s".' %
986              (field.name, which_oneof, field.containing_oneof.name,
987               message_descriptor.full_name))
988
989      if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
990        tokenizer.TryConsume(':')
991        self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
992                                 field.full_name)
993        merger = self._MergeMessageField
994      else:
995        tokenizer.Consume(':')
996        self._DetectSilentMarker(tokenizer, message_descriptor.full_name,
997                                 field.full_name)
998        merger = self._MergeScalarField
999
1000      if (field.label == descriptor.FieldDescriptor.LABEL_REPEATED and
1001          tokenizer.TryConsume('[')):
1002        # Short repeated format, e.g. "foo: [1, 2, 3]"
1003        if not tokenizer.TryConsume(']'):
1004          while True:
1005            merger(tokenizer, message, field)
1006            if tokenizer.TryConsume(']'):
1007              break
1008            tokenizer.Consume(',')
1009
1010      else:
1011        merger(tokenizer, message, field)
1012
1013    else:  # Proto field is unknown.
1014      assert (self.allow_unknown_extension or self.allow_unknown_field)
1015      self._SkipFieldContents(tokenizer, name, message_descriptor.full_name)
1016
1017    # For historical reasons, fields may optionally be separated by commas or
1018    # semicolons.
1019    if not tokenizer.TryConsume(','):
1020      tokenizer.TryConsume(';')
1021
1022  def _LogSilentMarker(self, immediate_message_type, field_name):
1023    pass
1024
1025  def _DetectSilentMarker(self, tokenizer, immediate_message_type, field_name):
1026    if tokenizer.contains_silent_marker_before_current_token:
1027      self._LogSilentMarker(immediate_message_type, field_name)
1028
1029  def _ConsumeAnyTypeUrl(self, tokenizer):
1030    """Consumes a google.protobuf.Any type URL and returns the type name."""
1031    # Consume "type.googleapis.com/".
1032    prefix = [tokenizer.ConsumeIdentifier()]
1033    tokenizer.Consume('.')
1034    prefix.append(tokenizer.ConsumeIdentifier())
1035    tokenizer.Consume('.')
1036    prefix.append(tokenizer.ConsumeIdentifier())
1037    tokenizer.Consume('/')
1038    # Consume the fully-qualified type name.
1039    name = [tokenizer.ConsumeIdentifier()]
1040    while tokenizer.TryConsume('.'):
1041      name.append(tokenizer.ConsumeIdentifier())
1042    return '.'.join(prefix), '.'.join(name)
1043
1044  def _MergeMessageField(self, tokenizer, message, field):
1045    """Merges a single scalar field into a message.
1046
1047    Args:
1048      tokenizer: A tokenizer to parse the field value.
1049      message: The message of which field is a member.
1050      field: The descriptor of the field to be merged.
1051
1052    Raises:
1053      ParseError: In case of text parsing problems.
1054    """
1055    is_map_entry = _IsMapEntry(field)
1056
1057    if tokenizer.TryConsume('<'):
1058      end_token = '>'
1059    else:
1060      tokenizer.Consume('{')
1061      end_token = '}'
1062
1063    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1064      if field.is_extension:
1065        sub_message = message.Extensions[field].add()
1066      elif is_map_entry:
1067        sub_message = getattr(message, field.name).GetEntryClass()()
1068      else:
1069        sub_message = getattr(message, field.name).add()
1070    else:
1071      if field.is_extension:
1072        if (not self._allow_multiple_scalars and
1073            message.HasExtension(field)):
1074          raise tokenizer.ParseErrorPreviousToken(
1075              'Message type "%s" should not have multiple "%s" extensions.' %
1076              (message.DESCRIPTOR.full_name, field.full_name))
1077        sub_message = message.Extensions[field]
1078      else:
1079        # Also apply _allow_multiple_scalars to message field.
1080        # TODO: Change to _allow_singular_overwrites.
1081        if (not self._allow_multiple_scalars and
1082            message.HasField(field.name)):
1083          raise tokenizer.ParseErrorPreviousToken(
1084              'Message type "%s" should not have multiple "%s" fields.' %
1085              (message.DESCRIPTOR.full_name, field.name))
1086        sub_message = getattr(message, field.name)
1087      sub_message.SetInParent()
1088
1089    while not tokenizer.TryConsume(end_token):
1090      if tokenizer.AtEnd():
1091        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token,))
1092      self._MergeField(tokenizer, sub_message)
1093
1094    if is_map_entry:
1095      value_cpptype = field.message_type.fields_by_name['value'].cpp_type
1096      if value_cpptype == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
1097        value = getattr(message, field.name)[sub_message.key]
1098        value.CopyFrom(sub_message.value)
1099      else:
1100        getattr(message, field.name)[sub_message.key] = sub_message.value
1101
1102  def _MergeScalarField(self, tokenizer, message, field):
1103    """Merges a single scalar field into a message.
1104
1105    Args:
1106      tokenizer: A tokenizer to parse the field value.
1107      message: A protocol message to record the data.
1108      field: The descriptor of the field to be merged.
1109
1110    Raises:
1111      ParseError: In case of text parsing problems.
1112      RuntimeError: On runtime errors.
1113    """
1114    _ = self.allow_unknown_extension
1115    value = None
1116
1117    if field.type in (descriptor.FieldDescriptor.TYPE_INT32,
1118                      descriptor.FieldDescriptor.TYPE_SINT32,
1119                      descriptor.FieldDescriptor.TYPE_SFIXED32):
1120      value = _ConsumeInt32(tokenizer)
1121    elif field.type in (descriptor.FieldDescriptor.TYPE_INT64,
1122                        descriptor.FieldDescriptor.TYPE_SINT64,
1123                        descriptor.FieldDescriptor.TYPE_SFIXED64):
1124      value = _ConsumeInt64(tokenizer)
1125    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT32,
1126                        descriptor.FieldDescriptor.TYPE_FIXED32):
1127      value = _ConsumeUint32(tokenizer)
1128    elif field.type in (descriptor.FieldDescriptor.TYPE_UINT64,
1129                        descriptor.FieldDescriptor.TYPE_FIXED64):
1130      value = _ConsumeUint64(tokenizer)
1131    elif field.type in (descriptor.FieldDescriptor.TYPE_FLOAT,
1132                        descriptor.FieldDescriptor.TYPE_DOUBLE):
1133      value = tokenizer.ConsumeFloat()
1134    elif field.type == descriptor.FieldDescriptor.TYPE_BOOL:
1135      value = tokenizer.ConsumeBool()
1136    elif field.type == descriptor.FieldDescriptor.TYPE_STRING:
1137      value = tokenizer.ConsumeString()
1138    elif field.type == descriptor.FieldDescriptor.TYPE_BYTES:
1139      value = tokenizer.ConsumeByteString()
1140    elif field.type == descriptor.FieldDescriptor.TYPE_ENUM:
1141      value = tokenizer.ConsumeEnum(field)
1142    else:
1143      raise RuntimeError('Unknown field type %d' % field.type)
1144
1145    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
1146      if field.is_extension:
1147        message.Extensions[field].append(value)
1148      else:
1149        getattr(message, field.name).append(value)
1150    else:
1151      if field.is_extension:
1152        if (not self._allow_multiple_scalars and
1153            field.has_presence and
1154            message.HasExtension(field)):
1155          raise tokenizer.ParseErrorPreviousToken(
1156              'Message type "%s" should not have multiple "%s" extensions.' %
1157              (message.DESCRIPTOR.full_name, field.full_name))
1158        else:
1159          message.Extensions[field] = value
1160      else:
1161        duplicate_error = False
1162        if not self._allow_multiple_scalars:
1163          if field.has_presence:
1164            duplicate_error = message.HasField(field.name)
1165          else:
1166            # For a field that doesn't track presence, make a best-effort
1167            # check for multiple scalars by comparing to the default value.
1168            duplicate_error = bool(getattr(message, field.name))
1169
1170        if duplicate_error:
1171          raise tokenizer.ParseErrorPreviousToken(
1172              'Message type "%s" should not have multiple "%s" fields.' %
1173              (message.DESCRIPTOR.full_name, field.name))
1174        else:
1175          setattr(message, field.name, value)
1176
1177  def _SkipFieldContents(self, tokenizer, field_name, immediate_message_type):
1178    """Skips over contents (value or message) of a field.
1179
1180    Args:
1181      tokenizer: A tokenizer to parse the field name and values.
1182      field_name: The field name currently being parsed.
1183      immediate_message_type: The type of the message immediately containing
1184        the silent marker.
1185    """
1186    # Try to guess the type of this field.
1187    # If this field is not a message, there should be a ":" between the
1188    # field name and the field value and also the field value should not
1189    # start with "{" or "<" which indicates the beginning of a message body.
1190    # If there is no ":" or there is a "{" or "<" after ":", this field has
1191    # to be a message or the input is ill-formed.
1192    if tokenizer.TryConsume(
1193        ':') and not tokenizer.LookingAt('{') and not tokenizer.LookingAt('<'):
1194      self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1195      if tokenizer.LookingAt('['):
1196        self._SkipRepeatedFieldValue(tokenizer)
1197      else:
1198        self._SkipFieldValue(tokenizer)
1199    else:
1200      self._DetectSilentMarker(tokenizer, immediate_message_type, field_name)
1201      self._SkipFieldMessage(tokenizer, immediate_message_type)
1202
1203  def _SkipField(self, tokenizer, immediate_message_type):
1204    """Skips over a complete field (name and value/message).
1205
1206    Args:
1207      tokenizer: A tokenizer to parse the field name and values.
1208      immediate_message_type: The type of the message immediately containing
1209        the silent marker.
1210    """
1211    field_name = ''
1212    if tokenizer.TryConsume('['):
1213      # Consume extension or google.protobuf.Any type URL
1214      field_name += '[' + tokenizer.ConsumeIdentifier()
1215      num_identifiers = 1
1216      while tokenizer.TryConsume('.'):
1217        field_name += '.' + tokenizer.ConsumeIdentifier()
1218        num_identifiers += 1
1219      # This is possibly a type URL for an Any message.
1220      if num_identifiers == 3 and tokenizer.TryConsume('/'):
1221        field_name += '/' + tokenizer.ConsumeIdentifier()
1222        while tokenizer.TryConsume('.'):
1223          field_name += '.' + tokenizer.ConsumeIdentifier()
1224      tokenizer.Consume(']')
1225      field_name += ']'
1226    else:
1227      field_name += tokenizer.ConsumeIdentifierOrNumber()
1228
1229    self._SkipFieldContents(tokenizer, field_name, immediate_message_type)
1230
1231    # For historical reasons, fields may optionally be separated by commas or
1232    # semicolons.
1233    if not tokenizer.TryConsume(','):
1234      tokenizer.TryConsume(';')
1235
1236  def _SkipFieldMessage(self, tokenizer, immediate_message_type):
1237    """Skips over a field message.
1238
1239    Args:
1240      tokenizer: A tokenizer to parse the field name and values.
1241      immediate_message_type: The type of the message immediately containing
1242        the silent marker.
1243    """
1244    if tokenizer.TryConsume('<'):
1245      delimiter = '>'
1246    else:
1247      tokenizer.Consume('{')
1248      delimiter = '}'
1249
1250    while not tokenizer.LookingAt('>') and not tokenizer.LookingAt('}'):
1251      self._SkipField(tokenizer, immediate_message_type)
1252
1253    tokenizer.Consume(delimiter)
1254
1255  def _SkipFieldValue(self, tokenizer):
1256    """Skips over a field value.
1257
1258    Args:
1259      tokenizer: A tokenizer to parse the field name and values.
1260
1261    Raises:
1262      ParseError: In case an invalid field value is found.
1263    """
1264    if (not tokenizer.TryConsumeByteString() and
1265        not tokenizer.TryConsumeIdentifier() and
1266        not _TryConsumeInt64(tokenizer) and
1267        not _TryConsumeUint64(tokenizer) and
1268        not tokenizer.TryConsumeFloat()):
1269      raise ParseError('Invalid field value: ' + tokenizer.token)
1270
1271  def _SkipRepeatedFieldValue(self, tokenizer):
1272    """Skips over a repeated field value.
1273
1274    Args:
1275      tokenizer: A tokenizer to parse the field value.
1276    """
1277    tokenizer.Consume('[')
1278    if not tokenizer.LookingAt(']'):
1279      self._SkipFieldValue(tokenizer)
1280      while tokenizer.TryConsume(','):
1281        self._SkipFieldValue(tokenizer)
1282    tokenizer.Consume(']')
1283
1284
1285class Tokenizer(object):
1286  """Protocol buffer text representation tokenizer.
1287
1288  This class handles the lower level string parsing by splitting it into
1289  meaningful tokens.
1290
1291  It was directly ported from the Java protocol buffer API.
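
  Example::

    # A small sketch of how the parser drives the tokenizer over one line of
    # text format input.
    tokenizer = Tokenizer(['foo: 42'])
    name = tokenizer.ConsumeIdentifier()   # 'foo'
    tokenizer.Consume(':')
    value = tokenizer.ConsumeInteger()     # 42
    assert tokenizer.AtEnd()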
1292  """
1293
1294  _WHITESPACE = re.compile(r'\s+')
1295  _COMMENT = re.compile(r'(\s*#.*$)', re.MULTILINE)
1296  _WHITESPACE_OR_COMMENT = re.compile(r'(\s|(#.*$))+', re.MULTILINE)
1297  _TOKEN = re.compile('|'.join([
1298      r'[a-zA-Z_][0-9a-zA-Z_+-]*',  # an identifier
1299      r'([0-9+-]|(\.[0-9]))[0-9a-zA-Z_.+-]*',  # a number
1300  ] + [  # quoted str for each quote mark
1301      # Avoid backtracking! https://stackoverflow.com/a/844267
1302      r'{qt}[^{qt}\n\\]*((\\.)+[^{qt}\n\\]*)*({qt}|\\?$)'.format(qt=mark)
1303      for mark in _QUOTES
1304  ]))
1305
1306  _IDENTIFIER = re.compile(r'[^\d\W]\w*')
1307  _IDENTIFIER_OR_NUMBER = re.compile(r'\w+')
1308
1309  def __init__(self, lines, skip_comments=True):
1310    self._position = 0
1311    self._line = -1
1312    self._column = 0
1313    self._token_start = None
1314    self.token = ''
1315    self._lines = iter(lines)
1316    self._current_line = ''
1317    self._previous_line = 0
1318    self._previous_column = 0
1319    self._more_lines = True
1320    self._skip_comments = skip_comments
1321    self._whitespace_pattern = (skip_comments and self._WHITESPACE_OR_COMMENT
1322                                or self._WHITESPACE)
1323    self.contains_silent_marker_before_current_token = False
1324
1325    self._SkipWhitespace()
1326    self.NextToken()
1327
1328  def LookingAt(self, token):
1329    return self.token == token
1330
1331  def AtEnd(self):
1332    """Checks the end of the text was reached.
1333
1334    Returns:
1335      True iff the end was reached.
1336    """
1337    return not self.token
1338
1339  def _PopLine(self):
1340    while len(self._current_line) <= self._column:
1341      try:
1342        self._current_line = next(self._lines)
1343      except StopIteration:
1344        self._current_line = ''
1345        self._more_lines = False
1346        return
1347      else:
1348        self._line += 1
1349        self._column = 0
1350
1351  def _SkipWhitespace(self):
1352    while True:
1353      self._PopLine()
1354      match = self._whitespace_pattern.match(self._current_line, self._column)
1355      if not match:
1356        break
1357      self.contains_silent_marker_before_current_token = match.group(0) == (
1358          ' ' + _DEBUG_STRING_SILENT_MARKER)
1359      length = len(match.group(0))
1360      self._column += length
1361
1362  def TryConsume(self, token):
1363    """Tries to consume a given piece of text.
1364
1365    Args:
1366      token: Text to consume.
1367
1368    Returns:
1369      True iff the text was consumed.
1370    """
1371    if self.token == token:
1372      self.NextToken()
1373      return True
1374    return False
1375
1376  def Consume(self, token):
1377    """Consumes a piece of text.
1378
1379    Args:
1380      token: Text to consume.
1381
1382    Raises:
1383      ParseError: If the text couldn't be consumed.
1384    """
1385    if not self.TryConsume(token):
1386      raise self.ParseError('Expected "%s".' % token)
1387
1388  def ConsumeComment(self):
1389    result = self.token
1390    if not self._COMMENT.match(result):
1391      raise self.ParseError('Expected comment.')
1392    self.NextToken()
1393    return result
1394
1395  def ConsumeCommentOrTrailingComment(self):
1396    """Consumes a comment, returns a 2-tuple (trailing bool, comment str)."""
1397
1398    # Tokenizer initializes _previous_line and _previous_column to 0. As the
1399    # tokenizer starts, it looks like there is a previous token on the line.
1400    just_started = self._line == 0 and self._column == 0
1401
1402    before_parsing = self._previous_line
1403    comment = self.ConsumeComment()
1404
1405    # A trailing comment is a comment on the same line as the previous token.
1406    trailing = (self._previous_line == before_parsing
1407                and not just_started)
1408
1409    return trailing, comment
1410
1411  def TryConsumeIdentifier(self):
1412    try:
1413      self.ConsumeIdentifier()
1414      return True
1415    except ParseError:
1416      return False
1417
1418  def ConsumeIdentifier(self):
1419    """Consumes protocol message field identifier.
1420
1421    Returns:
1422      Identifier string.
1423
1424    Raises:
1425      ParseError: If an identifier couldn't be consumed.
1426    """
1427    result = self.token
1428    if not self._IDENTIFIER.match(result):
1429      raise self.ParseError('Expected identifier.')
1430    self.NextToken()
1431    return result
1432
1433  def TryConsumeIdentifierOrNumber(self):
1434    try:
1435      self.ConsumeIdentifierOrNumber()
1436      return True
1437    except ParseError:
1438      return False
1439
1440  def ConsumeIdentifierOrNumber(self):
1441    """Consumes protocol message field identifier.
1442
1443    Returns:
1444      Identifier string.
1445
1446    Raises:
1447      ParseError: If an identifier couldn't be consumed.
1448    """
1449    result = self.token
1450    if not self._IDENTIFIER_OR_NUMBER.match(result):
1451      raise self.ParseError('Expected identifier or number, got %s.' % result)
1452    self.NextToken()
1453    return result
1454
1455  def TryConsumeInteger(self):
1456    try:
1457      self.ConsumeInteger()
1458      return True
1459    except ParseError:
1460      return False
1461
1462  def ConsumeInteger(self):
1463    """Consumes an integer number.
1464
1465    Returns:
1466      The integer parsed.
1467
1468    Raises:
1469      ParseError: If an integer couldn't be consumed.
1470    """
1471    try:
1472      result = _ParseAbstractInteger(self.token)
1473    except ValueError as e:
1474      raise self.ParseError(str(e))
1475    self.NextToken()
1476    return result
1477
1478  def TryConsumeFloat(self):
1479    try:
1480      self.ConsumeFloat()
1481      return True
1482    except ParseError:
1483      return False
1484
1485  def ConsumeFloat(self):
1486    """Consumes an floating point number.
1487
1488    Returns:
1489      The number parsed.
1490
1491    Raises:
1492      ParseError: If a floating point number couldn't be consumed.
1493    """
1494    try:
1495      result = ParseFloat(self.token)
1496    except ValueError as e:
1497      raise self.ParseError(str(e))
1498    self.NextToken()
1499    return result
1500
1501  def ConsumeBool(self):
1502    """Consumes a boolean value.
1503
1504    Returns:
1505      The bool parsed.
1506
1507    Raises:
1508      ParseError: If a boolean value couldn't be consumed.
1509    """
1510    try:
1511      result = ParseBool(self.token)
1512    except ValueError as e:
1513      raise self.ParseError(str(e))
1514    self.NextToken()
1515    return result
1516
1517  def TryConsumeByteString(self):
1518    try:
1519      self.ConsumeByteString()
1520      return True
1521    except ParseError:
1522      return False
1523
1524  def ConsumeString(self):
1525    """Consumes a string value.
1526
1527    Returns:
1528      The string parsed.
1529
1530    Raises:
1531      ParseError: If a string value couldn't be consumed.
1532    """
1533    the_bytes = self.ConsumeByteString()
1534    try:
1535      return str(the_bytes, 'utf-8')
1536    except UnicodeDecodeError as e:
1537      raise self._StringParseError(e)
1538
1539  def ConsumeByteString(self):
1540    """Consumes a byte array value.
1541
1542    Returns:
1543      The array parsed, as a bytes object.
1544
1545    Raises:
1546      ParseError: If a byte array value couldn't be consumed.
1547    """
1548    the_list = [self._ConsumeSingleByteString()]
1549    while self.token and self.token[0] in _QUOTES:
1550      the_list.append(self._ConsumeSingleByteString())
1551    return b''.join(the_list)
1552
1553  def _ConsumeSingleByteString(self):
1554    """Consume one token of a string literal.
1555
1556    String literals (whether bytes or text) can come in multiple adjacent
1557    tokens which are automatically concatenated, like in C or Python.  This
1558    method only consumes one token.
1559
1560    Returns:
1561      The token parsed.
1562    Raises:
1563      ParseError: When the wrong format data is found.
1564    """
1565    text = self.token
1566    if len(text) < 1 or text[0] not in _QUOTES:
1567      raise self.ParseError('Expected string but found: %r' % (text,))
1568
1569    if len(text) < 2 or text[-1] != text[0]:
1570      raise self.ParseError('String missing ending quote: %r' % (text,))
1571
1572    try:
1573      result = text_encoding.CUnescape(text[1:-1])
1574    except ValueError as e:
1575      raise self.ParseError(str(e))
1576    self.NextToken()
1577    return result
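
  # Illustrative note (not part of the original source): adjacent quoted tokens
  # are concatenated by ConsumeByteString, mirroring C/Python string literals:
  #   value: "foo" 'bar'
  # produces b'foobar', and ConsumeString additionally decodes the result as
  # UTF-8.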
1578
1579  def ConsumeEnum(self, field):
1580    try:
1581      result = ParseEnum(field, self.token)
1582    except ValueError as e:
1583      raise self.ParseError(str(e))
1584    self.NextToken()
1585    return result
1586
1587  def ParseErrorPreviousToken(self, message):
1588    """Creates and *returns* a ParseError for the previously read token.
1589
1590    Args:
1591      message: A message to set for the exception.
1592
1593    Returns:
1594      A ParseError instance.
1595    """
1596    return ParseError(message, self._previous_line + 1,
1597                      self._previous_column + 1)
1598
1599  def ParseError(self, message):
1600    """Creates and *returns* a ParseError for the current token."""
1601    return ParseError('\'' + self._current_line + '\': ' + message,
1602                      self._line + 1, self._column + 1)
1603
1604  def _StringParseError(self, e):
1605    return self.ParseError('Couldn\'t parse string: ' + str(e))
1606
1607  def NextToken(self):
1608    """Reads the next meaningful token."""
1609    self._previous_line = self._line
1610    self._previous_column = self._column
1611    self.contains_silent_marker_before_current_token = False
1612
1613    self._column += len(self.token)
1614    self._SkipWhitespace()
1615
1616    if not self._more_lines:
1617      self.token = ''
1618      return
1619
1620    match = self._TOKEN.match(self._current_line, self._column)
1621    if not match and not self._skip_comments:
1622      match = self._COMMENT.match(self._current_line, self._column)
1623    if match:
1624      token = match.group(0)
1625      self.token = token
1626    else:
1627      self.token = self._current_line[self._column]
1628
1629# Aliased so it can still be accessed by current visibility violators.
1630# TODO: Migrate violators to textformat_tokenizer.
1631_Tokenizer = Tokenizer  # pylint: disable=invalid-name
1632
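# Illustrative sketch (not part of the original source): a minimal walk over a
# text-format snippet using the tokenizer defined above.
#
#   tokenizer = Tokenizer(['foo: 1'])
#   tokenizer.ConsumeIdentifier()   # -> 'foo'
#   tokenizer.Consume(':')
#   tokenizer.ConsumeInteger()      # -> 1
#   tokenizer.AtEnd()               # -> True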
1633
1634def _ConsumeInt32(tokenizer):
1635  """Consumes a signed 32bit integer number from tokenizer.
1636
1637  Args:
1638    tokenizer: A tokenizer used to parse the number.
1639
1640  Returns:
1641    The integer parsed.
1642
1643  Raises:
1644    ParseError: If a signed 32bit integer couldn't be consumed.
1645  """
1646  return _ConsumeInteger(tokenizer, is_signed=True, is_long=False)
1647
1648
1649def _ConsumeUint32(tokenizer):
1650  """Consumes an unsigned 32bit integer number from tokenizer.
1651
1652  Args:
1653    tokenizer: A tokenizer used to parse the number.
1654
1655  Returns:
1656    The integer parsed.
1657
1658  Raises:
1659    ParseError: If an unsigned 32bit integer couldn't be consumed.
1660  """
1661  return _ConsumeInteger(tokenizer, is_signed=False, is_long=False)
1662
1663
1664def _TryConsumeInt64(tokenizer):
1665  try:
1666    _ConsumeInt64(tokenizer)
1667    return True
1668  except ParseError:
1669    return False
1670
1671
1672def _ConsumeInt64(tokenizer):
1673  """Consumes a signed 32bit integer number from tokenizer.
1674
1675  Args:
1676    tokenizer: A tokenizer used to parse the number.
1677
1678  Returns:
1679    The integer parsed.
1680
1681  Raises:
1682    ParseError: If a signed 64bit integer couldn't be consumed.
1683  """
1684  return _ConsumeInteger(tokenizer, is_signed=True, is_long=True)
1685
1686
1687def _TryConsumeUint64(tokenizer):
1688  try:
1689    _ConsumeUint64(tokenizer)
1690    return True
1691  except ParseError:
1692    return False
1693
1694
1695def _ConsumeUint64(tokenizer):
1696  """Consumes an unsigned 64bit integer number from tokenizer.
1697
1698  Args:
1699    tokenizer: A tokenizer used to parse the number.
1700
1701  Returns:
1702    The integer parsed.
1703
1704  Raises:
1705    ParseError: If an unsigned 64bit integer couldn't be consumed.
1706  """
1707  return _ConsumeInteger(tokenizer, is_signed=False, is_long=True)
1708
1709
1710def _ConsumeInteger(tokenizer, is_signed=False, is_long=False):
1711  """Consumes an integer number from tokenizer.
1712
1713  Args:
1714    tokenizer: A tokenizer used to parse the number.
1715    is_signed: True if a signed integer must be parsed.
1716    is_long: True if a long integer must be parsed.
1717
1718  Returns:
1719    The integer parsed.
1720
1721  Raises:
1722    ParseError: If an integer with given characteristics couldn't be consumed.
1723  """
1724  try:
1725    result = ParseInteger(tokenizer.token, is_signed=is_signed, is_long=is_long)
1726  except ValueError as e:
1727    raise tokenizer.ParseError(str(e))
1728  tokenizer.NextToken()
1729  return result
1730
1731
1732def ParseInteger(text, is_signed=False, is_long=False):
1733  """Parses an integer.
1734
1735  Args:
1736    text: The text to parse.
1737    is_signed: True if a signed integer must be parsed.
1738    is_long: True if a long integer must be parsed.
1739
1740  Returns:
1741    The integer value.
1742
1743  Raises:
1744    ValueError: If the text is not a valid integer.
1745  """
1746  # Do the actual parsing. Exception handling is propagated to caller.
1747  result = _ParseAbstractInteger(text)
1748
1749  # Check if the integer is sane. Exceptions handled by callers.
1750  checker = _INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
1751  checker.CheckValue(result)
1752  return result
1753
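# Illustrative note (not part of the original source): the checker tuple is
# indexed as 2 * int(is_long) + int(is_signed), so for example
#   ParseInteger('4294967295', is_signed=False, is_long=False)  # uint32 max, ok
#   ParseInteger('4294967295', is_signed=True, is_long=False)   # rejected: out of int32 range
#   ParseInteger('-0x80000000', is_signed=True, is_long=False)  # -> -2147483648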
1754
1755def _ParseAbstractInteger(text):
1756  """Parses an integer without checking size/signedness.
1757
1758  Args:
1759    text: The text to parse.
1760
1761  Returns:
1762    The integer value.
1763
1764  Raises:
1765    ValueError: If the text is not a valid integer.
1766  """
1767  # Do the actual parsing. Exception handling is propagated to caller.
1768  orig_text = text
1769  c_octal_match = re.match(r'(-?)0(\d+)$', text)
1770  if c_octal_match:
1771    # Python 3 no longer supports 0755 octal syntax without the 'o', so
1772    # we always use the '0o' prefix for multi-digit numbers starting with 0.
1773    text = c_octal_match.group(1) + '0o' + c_octal_match.group(2)
1774  try:
1775    return int(text, 0)
1776  except ValueError:
1777    raise ValueError('Couldn\'t parse integer: %s' % orig_text)
1778
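# Illustrative note (not part of the original source): the rewrite above lets
# C-style octal literals parse under Python 3, e.g.
#   _ParseAbstractInteger('0755')   # -> 493  (treated as 0o755)
#   _ParseAbstractInteger('-012')   # -> -10
#   _ParseAbstractInteger('0x1f')   # -> 31   (hex goes straight to int(text, 0))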
1779
1780def ParseFloat(text):
1781  """Parse a floating point number.
1782
1783  Args:
1784    text: Text to parse.
1785
1786  Returns:
1787    The number parsed.
1788
1789  Raises:
1790    ValueError: If a floating point number couldn't be parsed.
1791  """
1792  try:
1793    # Assume Python compatible syntax.
1794    return float(text)
1795  except ValueError:
1796    # Check alternative spellings.
1797    if _FLOAT_INFINITY.match(text):
1798      if text[0] == '-':
1799        return float('-inf')
1800      else:
1801        return float('inf')
1802    elif _FLOAT_NAN.match(text):
1803      return float('nan')
1804    else:
1805      # assume '1.0f' format
1806      try:
1807        return float(text.rstrip('f'))
1808      except ValueError:
1809        raise ValueError("Couldn't parse float: %s" % text)
1810
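# Illustrative note (not part of the original source): ParseFloat also accepts
# spellings that float() alone rejects, e.g.
#   ParseFloat('-inff')  # -> float('-inf'), C++-style 'f' suffix allowed
#   ParseFloat('nanf')   # -> float('nan')
#   ParseFloat('2.5f')   # -> 2.5, trailing 'f' stripped
#   ParseFloat('abc')    # raises ValueError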
1811
1812def ParseBool(text):
1813  """Parse a boolean value.
1814
1815  Args:
1816    text: Text to parse.
1817
1818  Returns:
1819    The boolean value parsed.
1820
1821  Raises:
1822    ValueError: If text is not a valid boolean.
1823  """
1824  if text in ('true', 't', '1', 'True'):
1825    return True
1826  elif text in ('false', 'f', '0', 'False'):
1827    return False
1828  else:
1829    raise ValueError('Expected "true" or "false".')
1830
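# Illustrative note (not part of the original source): only the exact spellings
# listed above are accepted, e.g.
#   ParseBool('true')  # -> True
#   ParseBool('0')     # -> False
#   ParseBool('TRUE')  # raises ValueError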
1831
1832def ParseEnum(field, value):
1833  """Parse an enum value.
1834
1835  The value can be specified by a number (the enum value), or by
1836  a string literal (the enum name).
1837
1838  Args:
1839    field: Enum field descriptor.
1840    value: String value.
1841
1842  Returns:
1843    Enum value number.
1844
1845  Raises:
1846    ValueError: If the enum value could not be parsed.
1847  """
1848  enum_descriptor = field.enum_type
1849  try:
1850    number = int(value, 0)
1851  except ValueError:
1852    # Identifier.
1853    enum_value = enum_descriptor.values_by_name.get(value, None)
1854    if enum_value is None:
1855      raise ValueError('Enum type "%s" has no value named %s.' %
1856                       (enum_descriptor.full_name, value))
1857  else:
1858    if not field.enum_type.is_closed:
1859      return number
1860    enum_value = enum_descriptor.values_by_number.get(number, None)
1861    if enum_value is None:
1862      raise ValueError('Enum type "%s" has no value with number %d.' %
1863                       (enum_descriptor.full_name, number))
1864  return enum_value.number
1865
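# Illustrative sketch (not part of the original source), assuming a field whose
# enum type declares RED = 0 and GREEN = 1:
#   ParseEnum(field, 'GREEN')  # -> 1 (looked up by name)
#   ParseEnum(field, '1')      # -> 1 (looked up by number)
#   ParseEnum(field, '42')     # -> 42 for an open enum; ValueError for a closed
#                              #    enum with no value numbered 42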