• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import com.google.protobuf.Descriptors.Descriptor;
34 import com.google.protobuf.Descriptors.EnumDescriptor;
35 import com.google.protobuf.Descriptors.EnumValueDescriptor;
36 import com.google.protobuf.Descriptors.FieldDescriptor;
37 
38 import java.io.IOException;
39 import java.math.BigInteger;
40 import java.nio.CharBuffer;
41 import java.util.ArrayList;
42 import java.util.List;
43 import java.util.Locale;
44 import java.util.Map;
45 import java.util.logging.Logger;
46 import java.util.regex.Matcher;
47 import java.util.regex.Pattern;
48 
49 /**
50  * Provide text parsing and formatting support for proto2 instances.
51  * The implementation largely follows google/protobuf/text_format.cc.
52  *
53  * @author wenboz@google.com Wenbo Zhu
54  * @author kenton@google.com Kenton Varda
55  */
56 public final class TextFormat {
TextFormat()57   private TextFormat() {}
58 
59   private static final Logger logger =
60       Logger.getLogger(TextFormat.class.getName());
61 
62   private static final Printer DEFAULT_PRINTER = new Printer();
63   private static final Printer SINGLE_LINE_PRINTER =
64       (new Printer()).setSingleLineMode(true);
65   private static final Printer UNICODE_PRINTER =
66       (new Printer()).setEscapeNonAscii(false);
67 
68   /**
69    * Outputs a textual representation of the Protocol Message supplied into
70    * the parameter output. (This representation is the new version of the
71    * classic "ProtocolPrinter" output from the original Protocol Buffer system)
72    */
print( final MessageOrBuilder message, final Appendable output)73   public static void print(
74       final MessageOrBuilder message, final Appendable output)
75       throws IOException {
76     DEFAULT_PRINTER.print(message, new TextGenerator(output));
77   }
78 
79   /** Outputs a textual representation of {@code fields} to {@code output}. */
print(final UnknownFieldSet fields, final Appendable output)80   public static void print(final UnknownFieldSet fields,
81                            final Appendable output)
82                            throws IOException {
83     DEFAULT_PRINTER.printUnknownFields(fields, new TextGenerator(output));
84   }
85 
86   /**
87    * Same as {@code print()}, except that non-ASCII characters are not
88    * escaped.
89    */
printUnicode( final MessageOrBuilder message, final Appendable output)90   public static void printUnicode(
91       final MessageOrBuilder message, final Appendable output)
92       throws IOException {
93     UNICODE_PRINTER.print(message, new TextGenerator(output));
94   }
95 
96   /**
97    * Same as {@code print()}, except that non-ASCII characters are not
98    * escaped.
99    */
printUnicode(final UnknownFieldSet fields, final Appendable output)100   public static void printUnicode(final UnknownFieldSet fields,
101                                   final Appendable output)
102                                   throws IOException {
103     UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(output));
104   }
105 
106   /**
107    * Generates a human readable form of this message, useful for debugging and
108    * other purposes, with no newline characters.
109    */
shortDebugString(final MessageOrBuilder message)110   public static String shortDebugString(final MessageOrBuilder message) {
111     try {
112       final StringBuilder sb = new StringBuilder();
113       SINGLE_LINE_PRINTER.print(message, new TextGenerator(sb));
114       // Single line mode currently might have an extra space at the end.
115       return sb.toString().trim();
116     } catch (IOException e) {
117       throw new IllegalStateException(e);
118     }
119   }
120 
121   /**
122    * Generates a human readable form of the unknown fields, useful for debugging
123    * and other purposes, with no newline characters.
124    */
shortDebugString(final UnknownFieldSet fields)125   public static String shortDebugString(final UnknownFieldSet fields) {
126     try {
127       final StringBuilder sb = new StringBuilder();
128       SINGLE_LINE_PRINTER.printUnknownFields(fields, new TextGenerator(sb));
129       // Single line mode currently might have an extra space at the end.
130       return sb.toString().trim();
131     } catch (IOException e) {
132       throw new IllegalStateException(e);
133     }
134   }
135 
136   /**
137    * Like {@code print()}, but writes directly to a {@code String} and
138    * returns it.
139    */
printToString(final MessageOrBuilder message)140   public static String printToString(final MessageOrBuilder message) {
141     try {
142       final StringBuilder text = new StringBuilder();
143       print(message, text);
144       return text.toString();
145     } catch (IOException e) {
146       throw new IllegalStateException(e);
147     }
148   }
149 
150   /**
151    * Like {@code print()}, but writes directly to a {@code String} and
152    * returns it.
153    */
printToString(final UnknownFieldSet fields)154   public static String printToString(final UnknownFieldSet fields) {
155     try {
156       final StringBuilder text = new StringBuilder();
157       print(fields, text);
158       return text.toString();
159     } catch (IOException e) {
160       throw new IllegalStateException(e);
161     }
162   }
163 
164   /**
165    * Same as {@code printToString()}, except that non-ASCII characters
166    * in string type fields are not escaped in backslash+octals.
167    */
printToUnicodeString(final MessageOrBuilder message)168   public static String printToUnicodeString(final MessageOrBuilder message) {
169     try {
170       final StringBuilder text = new StringBuilder();
171       UNICODE_PRINTER.print(message, new TextGenerator(text));
172       return text.toString();
173     } catch (IOException e) {
174       throw new IllegalStateException(e);
175     }
176   }
177 
178   /**
179    * Same as {@code printToString()}, except that non-ASCII characters
180    * in string type fields are not escaped in backslash+octals.
181    */
printToUnicodeString(final UnknownFieldSet fields)182   public static String printToUnicodeString(final UnknownFieldSet fields) {
183     try {
184       final StringBuilder text = new StringBuilder();
185       UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(text));
186       return text.toString();
187     } catch (IOException e) {
188       throw new IllegalStateException(e);
189     }
190   }
191 
printField(final FieldDescriptor field, final Object value, final Appendable output)192   public static void printField(final FieldDescriptor field,
193                                 final Object value,
194                                 final Appendable output)
195                                 throws IOException {
196     DEFAULT_PRINTER.printField(field, value, new TextGenerator(output));
197   }
198 
printFieldToString(final FieldDescriptor field, final Object value)199   public static String printFieldToString(final FieldDescriptor field,
200                                           final Object value) {
201     try {
202       final StringBuilder text = new StringBuilder();
203       printField(field, value, text);
204       return text.toString();
205     } catch (IOException e) {
206       throw new IllegalStateException(e);
207     }
208   }
209 
210   /**
211    * Outputs a textual representation of the value of given field value.
212    *
213    * @param field the descriptor of the field
214    * @param value the value of the field
215    * @param output the output to which to append the formatted value
216    * @throws ClassCastException if the value is not appropriate for the
217    *     given field descriptor
218    * @throws IOException if there is an exception writing to the output
219    */
printFieldValue(final FieldDescriptor field, final Object value, final Appendable output)220   public static void printFieldValue(final FieldDescriptor field,
221                                      final Object value,
222                                      final Appendable output)
223                                      throws IOException {
224     DEFAULT_PRINTER.printFieldValue(field, value, new TextGenerator(output));
225   }
226 
227   /**
228    * Outputs a textual representation of the value of an unknown field.
229    *
230    * @param tag the field's tag number
231    * @param value the value of the field
232    * @param output the output to which to append the formatted value
233    * @throws ClassCastException if the value is not appropriate for the
234    *     given field descriptor
235    * @throws IOException if there is an exception writing to the output
236    */
printUnknownFieldValue(final int tag, final Object value, final Appendable output)237   public static void printUnknownFieldValue(final int tag,
238                                             final Object value,
239                                             final Appendable output)
240                                             throws IOException {
241     printUnknownFieldValue(tag, value, new TextGenerator(output));
242   }
243 
printUnknownFieldValue(final int tag, final Object value, final TextGenerator generator)244   private static void printUnknownFieldValue(final int tag,
245                                              final Object value,
246                                              final TextGenerator generator)
247                                              throws IOException {
248     switch (WireFormat.getTagWireType(tag)) {
249       case WireFormat.WIRETYPE_VARINT:
250         generator.print(unsignedToString((Long) value));
251         break;
252       case WireFormat.WIRETYPE_FIXED32:
253         generator.print(
254             String.format((Locale) null, "0x%08x", (Integer) value));
255         break;
256       case WireFormat.WIRETYPE_FIXED64:
257         generator.print(String.format((Locale) null, "0x%016x", (Long) value));
258         break;
259       case WireFormat.WIRETYPE_LENGTH_DELIMITED:
260         generator.print("\"");
261         generator.print(escapeBytes((ByteString) value));
262         generator.print("\"");
263         break;
264       case WireFormat.WIRETYPE_START_GROUP:
265         DEFAULT_PRINTER.printUnknownFields((UnknownFieldSet) value, generator);
266         break;
267       default:
268         throw new IllegalArgumentException("Bad tag: " + tag);
269     }
270   }
271 
272   /** Helper class for converting protobufs to text. */
273   private static final class Printer {
274     /** Whether to omit newlines from the output. */
275     boolean singleLineMode = false;
276 
277     /** Whether to escape non ASCII characters with backslash and octal. */
278     boolean escapeNonAscii = true;
279 
Printer()280     private Printer() {}
281 
282     /** Setter of singleLineMode */
setSingleLineMode(boolean singleLineMode)283     private Printer setSingleLineMode(boolean singleLineMode) {
284       this.singleLineMode = singleLineMode;
285       return this;
286     }
287 
288     /** Setter of escapeNonAscii */
setEscapeNonAscii(boolean escapeNonAscii)289     private Printer setEscapeNonAscii(boolean escapeNonAscii) {
290       this.escapeNonAscii = escapeNonAscii;
291       return this;
292     }
293 
print( final MessageOrBuilder message, final TextGenerator generator)294     private void print(
295         final MessageOrBuilder message, final TextGenerator generator)
296         throws IOException {
297       for (Map.Entry<FieldDescriptor, Object> field
298           : message.getAllFields().entrySet()) {
299         printField(field.getKey(), field.getValue(), generator);
300       }
301       printUnknownFields(message.getUnknownFields(), generator);
302     }
303 
printField(final FieldDescriptor field, final Object value, final TextGenerator generator)304     private void printField(final FieldDescriptor field, final Object value,
305         final TextGenerator generator) throws IOException {
306       if (field.isRepeated()) {
307         // Repeated field.  Print each element.
308         for (Object element : (List<?>) value) {
309           printSingleField(field, element, generator);
310         }
311       } else {
312         printSingleField(field, value, generator);
313       }
314     }
315 
printSingleField(final FieldDescriptor field, final Object value, final TextGenerator generator)316     private void printSingleField(final FieldDescriptor field,
317                                   final Object value,
318                                   final TextGenerator generator)
319                                   throws IOException {
320       if (field.isExtension()) {
321         generator.print("[");
322         // We special-case MessageSet elements for compatibility with proto1.
323         if (field.getContainingType().getOptions().getMessageSetWireFormat()
324             && (field.getType() == FieldDescriptor.Type.MESSAGE)
325             && (field.isOptional())
326             // object equality
327             && (field.getExtensionScope() == field.getMessageType())) {
328           generator.print(field.getMessageType().getFullName());
329         } else {
330           generator.print(field.getFullName());
331         }
332         generator.print("]");
333       } else {
334         if (field.getType() == FieldDescriptor.Type.GROUP) {
335           // Groups must be serialized with their original capitalization.
336           generator.print(field.getMessageType().getName());
337         } else {
338           generator.print(field.getName());
339         }
340       }
341 
342       if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
343         if (singleLineMode) {
344           generator.print(" { ");
345         } else {
346           generator.print(" {\n");
347           generator.indent();
348         }
349       } else {
350         generator.print(": ");
351       }
352 
353       printFieldValue(field, value, generator);
354 
355       if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
356         if (singleLineMode) {
357           generator.print("} ");
358         } else {
359           generator.outdent();
360           generator.print("}\n");
361         }
362       } else {
363         if (singleLineMode) {
364           generator.print(" ");
365         } else {
366           generator.print("\n");
367         }
368       }
369     }
370 
printFieldValue(final FieldDescriptor field, final Object value, final TextGenerator generator)371     private void printFieldValue(final FieldDescriptor field,
372                                  final Object value,
373                                  final TextGenerator generator)
374                                  throws IOException {
375       switch (field.getType()) {
376         case INT32:
377         case SINT32:
378         case SFIXED32:
379           generator.print(((Integer) value).toString());
380           break;
381 
382         case INT64:
383         case SINT64:
384         case SFIXED64:
385           generator.print(((Long) value).toString());
386           break;
387 
388         case BOOL:
389           generator.print(((Boolean) value).toString());
390           break;
391 
392         case FLOAT:
393           generator.print(((Float) value).toString());
394           break;
395 
396         case DOUBLE:
397           generator.print(((Double) value).toString());
398           break;
399 
400         case UINT32:
401         case FIXED32:
402           generator.print(unsignedToString((Integer) value));
403           break;
404 
405         case UINT64:
406         case FIXED64:
407           generator.print(unsignedToString((Long) value));
408           break;
409 
410         case STRING:
411           generator.print("\"");
412           generator.print(escapeNonAscii ?
413               escapeText((String) value) :
414               escapeDoubleQuotesAndBackslashes((String) value));
415           generator.print("\"");
416           break;
417 
418         case BYTES:
419           generator.print("\"");
420           if (value instanceof ByteString) {
421             generator.print(escapeBytes((ByteString) value));
422           } else {
423             generator.print(escapeBytes((byte[]) value));
424           }
425           generator.print("\"");
426           break;
427 
428         case ENUM:
429           generator.print(((EnumValueDescriptor) value).getName());
430           break;
431 
432         case MESSAGE:
433         case GROUP:
434           print((Message) value, generator);
435           break;
436       }
437     }
438 
printUnknownFields(final UnknownFieldSet unknownFields, final TextGenerator generator)439     private void printUnknownFields(final UnknownFieldSet unknownFields,
440                                     final TextGenerator generator)
441                                     throws IOException {
442       for (Map.Entry<Integer, UnknownFieldSet.Field> entry :
443                unknownFields.asMap().entrySet()) {
444         final int number = entry.getKey();
445         final UnknownFieldSet.Field field = entry.getValue();
446         printUnknownField(number, WireFormat.WIRETYPE_VARINT,
447             field.getVarintList(), generator);
448         printUnknownField(number, WireFormat.WIRETYPE_FIXED32,
449             field.getFixed32List(), generator);
450         printUnknownField(number, WireFormat.WIRETYPE_FIXED64,
451             field.getFixed64List(), generator);
452         printUnknownField(number, WireFormat.WIRETYPE_LENGTH_DELIMITED,
453             field.getLengthDelimitedList(), generator);
454         for (final UnknownFieldSet value : field.getGroupList()) {
455           generator.print(entry.getKey().toString());
456           if (singleLineMode) {
457             generator.print(" { ");
458           } else {
459             generator.print(" {\n");
460             generator.indent();
461           }
462           printUnknownFields(value, generator);
463           if (singleLineMode) {
464             generator.print("} ");
465           } else {
466             generator.outdent();
467             generator.print("}\n");
468           }
469         }
470       }
471     }
472 
printUnknownField(final int number, final int wireType, final List<?> values, final TextGenerator generator)473     private void printUnknownField(final int number,
474                                    final int wireType,
475                                    final List<?> values,
476                                    final TextGenerator generator)
477                                    throws IOException {
478       for (final Object value : values) {
479         generator.print(String.valueOf(number));
480         generator.print(": ");
481         printUnknownFieldValue(wireType, value, generator);
482         generator.print(singleLineMode ? " " : "\n");
483       }
484     }
485   }
486 
487   /** Convert an unsigned 32-bit integer to a string. */
unsignedToString(final int value)488   public static String unsignedToString(final int value) {
489     if (value >= 0) {
490       return Integer.toString(value);
491     } else {
492       return Long.toString(value & 0x00000000FFFFFFFFL);
493     }
494   }
495 
496   /** Convert an unsigned 64-bit integer to a string. */
unsignedToString(final long value)497   public static String unsignedToString(final long value) {
498     if (value >= 0) {
499       return Long.toString(value);
500     } else {
501       // Pull off the most-significant bit so that BigInteger doesn't think
502       // the number is negative, then set it again using setBit().
503       return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL)
504                        .setBit(63).toString();
505     }
506   }
507 
508   /**
509    * An inner class for writing text to the output stream.
510    */
511   private static final class TextGenerator {
512     private final Appendable output;
513     private final StringBuilder indent = new StringBuilder();
514     private boolean atStartOfLine = true;
515 
TextGenerator(final Appendable output)516     private TextGenerator(final Appendable output) {
517       this.output = output;
518     }
519 
520     /**
521      * Indent text by two spaces.  After calling Indent(), two spaces will be
522      * inserted at the beginning of each line of text.  Indent() may be called
523      * multiple times to produce deeper indents.
524      */
indent()525     public void indent() {
526       indent.append("  ");
527     }
528 
529     /**
530      * Reduces the current indent level by two spaces, or crashes if the indent
531      * level is zero.
532      */
outdent()533     public void outdent() {
534       final int length = indent.length();
535       if (length == 0) {
536         throw new IllegalArgumentException(
537             " Outdent() without matching Indent().");
538       }
539       indent.delete(length - 2, length);
540     }
541 
542     /**
543      * Print text to the output stream.
544      */
print(final CharSequence text)545     public void print(final CharSequence text) throws IOException {
546       final int size = text.length();
547       int pos = 0;
548 
549       for (int i = 0; i < size; i++) {
550         if (text.charAt(i) == '\n') {
551           write(text.subSequence(pos, i + 1));
552           pos = i + 1;
553           atStartOfLine = true;
554         }
555       }
556       write(text.subSequence(pos, size));
557     }
558 
write(final CharSequence data)559     private void write(final CharSequence data) throws IOException {
560       if (data.length() == 0) {
561         return;
562       }
563       if (atStartOfLine) {
564         atStartOfLine = false;
565         output.append(indent);
566       }
567       output.append(data);
568     }
569   }
570 
571   // =================================================================
572   // Parsing
573 
574   /**
575    * Represents a stream of tokens parsed from a {@code String}.
576    *
577    * <p>The Java standard library provides many classes that you might think
578    * would be useful for implementing this, but aren't.  For example:
579    *
580    * <ul>
581    * <li>{@code java.io.StreamTokenizer}:  This almost does what we want -- or,
582    *   at least, something that would get us close to what we want -- except
583    *   for one fatal flaw:  It automatically un-escapes strings using Java
584    *   escape sequences, which do not include all the escape sequences we
585    *   need to support (e.g. '\x').
586    * <li>{@code java.util.Scanner}:  This seems like a great way at least to
587    *   parse regular expressions out of a stream (so we wouldn't have to load
588    *   the entire input into a single string before parsing).  Sadly,
589    *   {@code Scanner} requires that tokens be delimited with some delimiter.
590    *   Thus, although the text "foo:" should parse to two tokens ("foo" and
591    *   ":"), {@code Scanner} would recognize it only as a single token.
592    *   Furthermore, {@code Scanner} provides no way to inspect the contents
593    *   of delimiters, making it impossible to keep track of line and column
594    *   numbers.
595    * </ul>
596    *
597    * <p>Luckily, Java's regular expression support does manage to be useful to
598    * us.  (Barely:  We need {@code Matcher.usePattern()}, which is new in
599    * Java 1.5.)  So, we can use that, at least.  Unfortunately, this implies
600    * that we need to have the entire input in one contiguous string.
601    */
602   private static final class Tokenizer {
603     private final CharSequence text;
604     private final Matcher matcher;
605     private String currentToken;
606 
607     // The character index within this.text at which the current token begins.
608     private int pos = 0;
609 
610     // The line and column numbers of the current token.
611     private int line = 0;
612     private int column = 0;
613 
614     // The line and column numbers of the previous token (allows throwing
615     // errors *after* consuming).
616     private int previousLine = 0;
617     private int previousColumn = 0;
618 
619     // We use possessive quantifiers (*+ and ++) because otherwise the Java
620     // regex matcher has stack overflows on large inputs.
621     private static final Pattern WHITESPACE =
622       Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
623     private static final Pattern TOKEN = Pattern.compile(
624       "[a-zA-Z_][0-9a-zA-Z_+-]*+|" +                // an identifier
625       "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" +             // a number
626       "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" +       // a double-quoted string
627       "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)",         // a single-quoted string
628       Pattern.MULTILINE);
629 
630     private static final Pattern DOUBLE_INFINITY = Pattern.compile(
631       "-?inf(inity)?",
632       Pattern.CASE_INSENSITIVE);
633     private static final Pattern FLOAT_INFINITY = Pattern.compile(
634       "-?inf(inity)?f?",
635       Pattern.CASE_INSENSITIVE);
636     private static final Pattern FLOAT_NAN = Pattern.compile(
637       "nanf?",
638       Pattern.CASE_INSENSITIVE);
639 
640     /** Construct a tokenizer that parses tokens from the given text. */
Tokenizer(final CharSequence text)641     private Tokenizer(final CharSequence text) {
642       this.text = text;
643       this.matcher = WHITESPACE.matcher(text);
644       skipWhitespace();
645       nextToken();
646     }
647 
648     /** Are we at the end of the input? */
atEnd()649     public boolean atEnd() {
650       return currentToken.length() == 0;
651     }
652 
653     /** Advance to the next token. */
nextToken()654     public void nextToken() {
655       previousLine = line;
656       previousColumn = column;
657 
658       // Advance the line counter to the current position.
659       while (pos < matcher.regionStart()) {
660         if (text.charAt(pos) == '\n') {
661           ++line;
662           column = 0;
663         } else {
664           ++column;
665         }
666         ++pos;
667       }
668 
669       // Match the next token.
670       if (matcher.regionStart() == matcher.regionEnd()) {
671         // EOF
672         currentToken = "";
673       } else {
674         matcher.usePattern(TOKEN);
675         if (matcher.lookingAt()) {
676           currentToken = matcher.group();
677           matcher.region(matcher.end(), matcher.regionEnd());
678         } else {
679           // Take one character.
680           currentToken = String.valueOf(text.charAt(pos));
681           matcher.region(pos + 1, matcher.regionEnd());
682         }
683 
684         skipWhitespace();
685       }
686     }
687 
688     /**
689      * Skip over any whitespace so that the matcher region starts at the next
690      * token.
691      */
skipWhitespace()692     private void skipWhitespace() {
693       matcher.usePattern(WHITESPACE);
694       if (matcher.lookingAt()) {
695         matcher.region(matcher.end(), matcher.regionEnd());
696       }
697     }
698 
699     /**
700      * If the next token exactly matches {@code token}, consume it and return
701      * {@code true}.  Otherwise, return {@code false} without doing anything.
702      */
tryConsume(final String token)703     public boolean tryConsume(final String token) {
704       if (currentToken.equals(token)) {
705         nextToken();
706         return true;
707       } else {
708         return false;
709       }
710     }
711 
712     /**
713      * If the next token exactly matches {@code token}, consume it.  Otherwise,
714      * throw a {@link ParseException}.
715      */
consume(final String token)716     public void consume(final String token) throws ParseException {
717       if (!tryConsume(token)) {
718         throw parseException("Expected \"" + token + "\".");
719       }
720     }
721 
722     /**
723      * Returns {@code true} if the next token is an integer, but does
724      * not consume it.
725      */
lookingAtInteger()726     public boolean lookingAtInteger() {
727       if (currentToken.length() == 0) {
728         return false;
729       }
730 
731       final char c = currentToken.charAt(0);
732       return ('0' <= c && c <= '9') ||
733              c == '-' || c == '+';
734     }
735 
736     /**
737      * Returns {@code true} if the current token's text is equal to that
738      * specified.
739      */
lookingAt(String text)740     public boolean lookingAt(String text) {
741       return currentToken.equals(text);
742     }
743 
744     /**
745      * If the next token is an identifier, consume it and return its value.
746      * Otherwise, throw a {@link ParseException}.
747      */
consumeIdentifier()748     public String consumeIdentifier() throws ParseException {
749       for (int i = 0; i < currentToken.length(); i++) {
750         final char c = currentToken.charAt(i);
751         if (('a' <= c && c <= 'z') ||
752             ('A' <= c && c <= 'Z') ||
753             ('0' <= c && c <= '9') ||
754             (c == '_') || (c == '.')) {
755           // OK
756         } else {
757           throw parseException(
758               "Expected identifier. Found '" + currentToken + "'");
759         }
760       }
761 
762       final String result = currentToken;
763       nextToken();
764       return result;
765     }
766 
767     /**
768      * If the next token is an identifier, consume it and return {@code true}.
769      * Otherwise, return {@code false} without doing anything.
770      */
tryConsumeIdentifier()771     public boolean tryConsumeIdentifier() {
772       try {
773         consumeIdentifier();
774         return true;
775       } catch (ParseException e) {
776         return false;
777       }
778     }
779 
780     /**
781      * If the next token is a 32-bit signed integer, consume it and return its
782      * value.  Otherwise, throw a {@link ParseException}.
783      */
consumeInt32()784     public int consumeInt32() throws ParseException {
785       try {
786         final int result = parseInt32(currentToken);
787         nextToken();
788         return result;
789       } catch (NumberFormatException e) {
790         throw integerParseException(e);
791       }
792     }
793 
794     /**
795      * If the next token is a 32-bit unsigned integer, consume it and return its
796      * value.  Otherwise, throw a {@link ParseException}.
797      */
consumeUInt32()798     public int consumeUInt32() throws ParseException {
799       try {
800         final int result = parseUInt32(currentToken);
801         nextToken();
802         return result;
803       } catch (NumberFormatException e) {
804         throw integerParseException(e);
805       }
806     }
807 
808     /**
809      * If the next token is a 64-bit signed integer, consume it and return its
810      * value.  Otherwise, throw a {@link ParseException}.
811      */
consumeInt64()812     public long consumeInt64() throws ParseException {
813       try {
814         final long result = parseInt64(currentToken);
815         nextToken();
816         return result;
817       } catch (NumberFormatException e) {
818         throw integerParseException(e);
819       }
820     }
821 
822     /**
823      * If the next token is a 64-bit signed integer, consume it and return
824      * {@code true}.  Otherwise, return {@code false} without doing anything.
825      */
tryConsumeInt64()826     public boolean tryConsumeInt64() {
827       try {
828         consumeInt64();
829         return true;
830       } catch (ParseException e) {
831         return false;
832       }
833     }
834 
835     /**
836      * If the next token is a 64-bit unsigned integer, consume it and return its
837      * value.  Otherwise, throw a {@link ParseException}.
838      */
consumeUInt64()839     public long consumeUInt64() throws ParseException {
840       try {
841         final long result = parseUInt64(currentToken);
842         nextToken();
843         return result;
844       } catch (NumberFormatException e) {
845         throw integerParseException(e);
846       }
847     }
848 
849     /**
850      * If the next token is a 64-bit unsigned integer, consume it and return
851      * {@code true}.  Otherwise, return {@code false} without doing anything.
852      */
tryConsumeUInt64()853     public boolean tryConsumeUInt64() {
854       try {
855         consumeUInt64();
856         return true;
857       } catch (ParseException e) {
858         return false;
859       }
860     }
861 
862     /**
863      * If the next token is a double, consume it and return its value.
864      * Otherwise, throw a {@link ParseException}.
865      */
consumeDouble()866     public double consumeDouble() throws ParseException {
867       // We need to parse infinity and nan separately because
868       // Double.parseDouble() does not accept "inf", "infinity", or "nan".
869       if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
870         final boolean negative = currentToken.startsWith("-");
871         nextToken();
872         return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
873       }
874       if (currentToken.equalsIgnoreCase("nan")) {
875         nextToken();
876         return Double.NaN;
877       }
878       try {
879         final double result = Double.parseDouble(currentToken);
880         nextToken();
881         return result;
882       } catch (NumberFormatException e) {
883         throw floatParseException(e);
884       }
885     }
886 
887     /**
888      * If the next token is a double, consume it and return {@code true}.
889      * Otherwise, return {@code false} without doing anything.
890      */
tryConsumeDouble()891     public boolean tryConsumeDouble() {
892       try {
893         consumeDouble();
894         return true;
895       } catch (ParseException e) {
896         return false;
897       }
898     }
899 
900     /**
901      * If the next token is a float, consume it and return its value.
902      * Otherwise, throw a {@link ParseException}.
903      */
consumeFloat()904     public float consumeFloat() throws ParseException {
905       // We need to parse infinity and nan separately because
906       // Float.parseFloat() does not accept "inf", "infinity", or "nan".
907       if (FLOAT_INFINITY.matcher(currentToken).matches()) {
908         final boolean negative = currentToken.startsWith("-");
909         nextToken();
910         return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
911       }
912       if (FLOAT_NAN.matcher(currentToken).matches()) {
913         nextToken();
914         return Float.NaN;
915       }
916       try {
917         final float result = Float.parseFloat(currentToken);
918         nextToken();
919         return result;
920       } catch (NumberFormatException e) {
921         throw floatParseException(e);
922       }
923     }
924 
925     /**
926      * If the next token is a float, consume it and return {@code true}.
927      * Otherwise, return {@code false} without doing anything.
928      */
tryConsumeFloat()929     public boolean tryConsumeFloat() {
930       try {
931         consumeFloat();
932         return true;
933       } catch (ParseException e) {
934         return false;
935       }
936     }
937 
938     /**
939      * If the next token is a boolean, consume it and return its value.
940      * Otherwise, throw a {@link ParseException}.
941      */
consumeBoolean()942     public boolean consumeBoolean() throws ParseException {
943       if (currentToken.equals("true") ||
944           currentToken.equals("t") ||
945           currentToken.equals("1")) {
946         nextToken();
947         return true;
948       } else if (currentToken.equals("false") ||
949                  currentToken.equals("f") ||
950                  currentToken.equals("0")) {
951         nextToken();
952         return false;
953       } else {
954         throw parseException("Expected \"true\" or \"false\".");
955       }
956     }
957 
958     /**
959      * If the next token is a string, consume it and return its (unescaped)
960      * value.  Otherwise, throw a {@link ParseException}.
961      */
consumeString()962     public String consumeString() throws ParseException {
963       return consumeByteString().toStringUtf8();
964     }
965 
966     /**
967      * If the next token is a string, consume it and return true.  Otherwise,
968      * return false.
969      */
tryConsumeString()970     public boolean tryConsumeString() {
971       try {
972         consumeString();
973         return true;
974       } catch (ParseException e) {
975         return false;
976       }
977     }
978 
979     /**
980      * If the next token is a string, consume it, unescape it as a
981      * {@link ByteString}, and return it.  Otherwise, throw a
982      * {@link ParseException}.
983      */
consumeByteString()984     public ByteString consumeByteString() throws ParseException {
985       List<ByteString> list = new ArrayList<ByteString>();
986       consumeByteString(list);
987       while (currentToken.startsWith("'") || currentToken.startsWith("\"")) {
988         consumeByteString(list);
989       }
990       return ByteString.copyFrom(list);
991     }
992 
993     /**
994      * Like {@link #consumeByteString()} but adds each token of the string to
995      * the given list.  String literals (whether bytes or text) may come in
996      * multiple adjacent tokens which are automatically concatenated, like in
997      * C or Python.
998      */
consumeByteString(List<ByteString> list)999     private void consumeByteString(List<ByteString> list)
1000         throws ParseException {
1001       final char quote = currentToken.length() > 0 ? currentToken.charAt(0)
1002                                                    : '\0';
1003       if (quote != '\"' && quote != '\'') {
1004         throw parseException("Expected string.");
1005       }
1006 
1007       if (currentToken.length() < 2 ||
1008           currentToken.charAt(currentToken.length() - 1) != quote) {
1009         throw parseException("String missing ending quote.");
1010       }
1011 
1012       try {
1013         final String escaped =
1014             currentToken.substring(1, currentToken.length() - 1);
1015         final ByteString result = unescapeBytes(escaped);
1016         nextToken();
1017         list.add(result);
1018       } catch (InvalidEscapeSequenceException e) {
1019         throw parseException(e.getMessage());
1020       }
1021     }
1022 
1023     /**
1024      * Returns a {@link ParseException} with the current line and column
1025      * numbers in the description, suitable for throwing.
1026      */
parseException(final String description)1027     public ParseException parseException(final String description) {
1028       // Note:  People generally prefer one-based line and column numbers.
1029       return new ParseException(
1030         line + 1, column + 1, description);
1031     }
1032 
1033     /**
1034      * Returns a {@link ParseException} with the line and column numbers of
1035      * the previous token in the description, suitable for throwing.
1036      */
parseExceptionPreviousToken( final String description)1037     public ParseException parseExceptionPreviousToken(
1038         final String description) {
1039       // Note:  People generally prefer one-based line and column numbers.
1040       return new ParseException(
1041         previousLine + 1, previousColumn + 1, description);
1042     }
1043 
1044     /**
1045      * Constructs an appropriate {@link ParseException} for the given
1046      * {@code NumberFormatException} when trying to parse an integer.
1047      */
integerParseException( final NumberFormatException e)1048     private ParseException integerParseException(
1049         final NumberFormatException e) {
1050       return parseException("Couldn't parse integer: " + e.getMessage());
1051     }
1052 
1053     /**
1054      * Constructs an appropriate {@link ParseException} for the given
1055      * {@code NumberFormatException} when trying to parse a float or double.
1056      */
floatParseException(final NumberFormatException e)1057     private ParseException floatParseException(final NumberFormatException e) {
1058       return parseException("Couldn't parse number: " + e.getMessage());
1059     }
1060   }
1061 
1062   /** Thrown when parsing an invalid text format message. */
1063   public static class ParseException extends IOException {
1064     private static final long serialVersionUID = 3196188060225107702L;
1065 
1066     private final int line;
1067     private final int column;
1068 
1069     /** Create a new instance, with -1 as the line and column numbers. */
ParseException(final String message)1070     public ParseException(final String message) {
1071       this(-1, -1, message);
1072     }
1073 
1074     /**
1075      * Create a new instance
1076      *
1077      * @param line the line number where the parse error occurred,
1078      * using 1-offset.
1079      * @param column the column number where the parser error occurred,
1080      * using 1-offset.
1081      */
ParseException(final int line, final int column, final String message)1082     public ParseException(final int line, final int column,
1083         final String message) {
1084       super(Integer.toString(line) + ":" + column + ": " + message);
1085       this.line = line;
1086       this.column = column;
1087     }
1088 
1089     /**
1090      * Return the line where the parse exception occurred, or -1 when
1091      * none is provided. The value is specified as 1-offset, so the first
1092      * line is line 1.
1093      */
getLine()1094     public int getLine() {
1095       return line;
1096     }
1097 
1098     /**
1099      * Return the column where the parse exception occurred, or -1 when
1100      * none is provided. The value is specified as 1-offset, so the first
1101      * line is line 1.
1102      */
getColumn()1103     public int getColumn() {
1104       return column;
1105     }
1106   }
1107 
1108   private static final Parser PARSER = Parser.newBuilder().build();
1109 
1110   /**
1111    * Return a {@link Parser} instance which can parse text-format
1112    * messages. The returned instance is thread-safe.
1113    */
getParser()1114   public static Parser getParser() {
1115     return PARSER;
1116   }
1117 
1118   /**
1119    * Parse a text-format message from {@code input} and merge the contents
1120    * into {@code builder}.
1121    */
merge(final Readable input, final Message.Builder builder)1122   public static void merge(final Readable input,
1123                            final Message.Builder builder)
1124                            throws IOException {
1125     PARSER.merge(input, builder);
1126   }
1127 
1128   /**
1129    * Parse a text-format message from {@code input} and merge the contents
1130    * into {@code builder}.
1131    */
merge(final CharSequence input, final Message.Builder builder)1132   public static void merge(final CharSequence input,
1133                            final Message.Builder builder)
1134                            throws ParseException {
1135     PARSER.merge(input, builder);
1136   }
1137 
1138   /**
1139    * Parse a text-format message from {@code input} and merge the contents
1140    * into {@code builder}.  Extensions will be recognized if they are
1141    * registered in {@code extensionRegistry}.
1142    */
merge(final Readable input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1143   public static void merge(final Readable input,
1144                            final ExtensionRegistry extensionRegistry,
1145                            final Message.Builder builder)
1146                            throws IOException {
1147     PARSER.merge(input, extensionRegistry, builder);
1148   }
1149 
1150 
1151   /**
1152    * Parse a text-format message from {@code input} and merge the contents
1153    * into {@code builder}.  Extensions will be recognized if they are
1154    * registered in {@code extensionRegistry}.
1155    */
merge(final CharSequence input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1156   public static void merge(final CharSequence input,
1157                            final ExtensionRegistry extensionRegistry,
1158                            final Message.Builder builder)
1159                            throws ParseException {
1160     PARSER.merge(input, extensionRegistry, builder);
1161   }
1162 
1163 
1164   /**
1165    * Parser for text-format proto2 instances. This class is thread-safe.
1166    * The implementation largely follows google/protobuf/text_format.cc.
1167    *
1168    * <p>Use {@link TextFormat#getParser()} to obtain the default parser, or
1169    * {@link Builder} to control the parser behavior.
1170    */
1171   public static class Parser {
1172     /**
1173      * Determines if repeated values for non-repeated fields and
1174      * oneofs are permitted. For example, given required/optional field "foo"
1175      * and a oneof containing "baz" and "qux":
1176      * <li>
1177      * <ul>"foo: 1 foo: 2"
1178      * <ul>"baz: 1 qux: 2"
1179      * <ul>merging "foo: 2" into a proto in which foo is already set, or
1180      * <ul>merging "qux: 2" into a proto in which baz is already set.
1181      * </li>
1182      */
1183     public enum SingularOverwritePolicy {
1184       /** The last value is retained. */
1185       ALLOW_SINGULAR_OVERWRITES,
1186       /** An error is issued. */
1187       FORBID_SINGULAR_OVERWRITES
1188     }
1189 
1190     private final boolean allowUnknownFields;
1191     private final SingularOverwritePolicy singularOverwritePolicy;
1192 
Parser(boolean allowUnknownFields, SingularOverwritePolicy singularOverwritePolicy)1193     private Parser(boolean allowUnknownFields,
1194         SingularOverwritePolicy singularOverwritePolicy) {
1195       this.allowUnknownFields = allowUnknownFields;
1196       this.singularOverwritePolicy = singularOverwritePolicy;
1197     }
1198 
1199     /**
1200      * Returns a new instance of {@link Builder}.
1201      */
newBuilder()1202     public static Builder newBuilder() {
1203       return new Builder();
1204     }
1205 
1206     /**
1207      * Builder that can be used to obtain new instances of {@link Parser}.
1208      */
1209     public static class Builder {
1210       private boolean allowUnknownFields = false;
1211       private SingularOverwritePolicy singularOverwritePolicy =
1212           SingularOverwritePolicy.ALLOW_SINGULAR_OVERWRITES;
1213 
1214       /**
1215        * Sets parser behavior when a non-repeated field appears more than once.
1216        */
setSingularOverwritePolicy(SingularOverwritePolicy p)1217       public Builder setSingularOverwritePolicy(SingularOverwritePolicy p) {
1218         this.singularOverwritePolicy = p;
1219         return this;
1220       }
1221 
build()1222       public Parser build() {
1223         return new Parser(allowUnknownFields, singularOverwritePolicy);
1224       }
1225     }
1226 
1227     /**
1228      * Parse a text-format message from {@code input} and merge the contents
1229      * into {@code builder}.
1230      */
merge(final Readable input, final Message.Builder builder)1231     public void merge(final Readable input,
1232                       final Message.Builder builder)
1233                       throws IOException {
1234       merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1235     }
1236 
1237     /**
1238      * Parse a text-format message from {@code input} and merge the contents
1239      * into {@code builder}.
1240      */
merge(final CharSequence input, final Message.Builder builder)1241     public void merge(final CharSequence input,
1242                       final Message.Builder builder)
1243                       throws ParseException {
1244       merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
1245     }
1246 
1247     /**
1248      * Parse a text-format message from {@code input} and merge the contents
1249      * into {@code builder}.  Extensions will be recognized if they are
1250      * registered in {@code extensionRegistry}.
1251      */
merge(final Readable input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1252     public void merge(final Readable input,
1253                       final ExtensionRegistry extensionRegistry,
1254                       final Message.Builder builder)
1255                       throws IOException {
1256       // Read the entire input to a String then parse that.
1257 
1258       // If StreamTokenizer were not quite so crippled, or if there were a kind
1259       // of Reader that could read in chunks that match some particular regex,
1260       // or if we wanted to write a custom Reader to tokenize our stream, then
1261       // we would not have to read to one big String.  Alas, none of these is
1262       // the case.  Oh well.
1263 
1264       merge(toStringBuilder(input), extensionRegistry, builder);
1265     }
1266 
1267 
1268     private static final int BUFFER_SIZE = 4096;
1269 
1270     // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
1271     // overhead is worthwhile
toStringBuilder(final Readable input)1272     private static StringBuilder toStringBuilder(final Readable input)
1273         throws IOException {
1274       final StringBuilder text = new StringBuilder();
1275       final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
1276       while (true) {
1277         final int n = input.read(buffer);
1278         if (n == -1) {
1279           break;
1280         }
1281         buffer.flip();
1282         text.append(buffer, 0, n);
1283       }
1284       return text;
1285     }
1286 
1287     /**
1288      * Parse a text-format message from {@code input} and merge the contents
1289      * into {@code builder}.  Extensions will be recognized if they are
1290      * registered in {@code extensionRegistry}.
1291      */
merge(final CharSequence input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1292     public void merge(final CharSequence input,
1293                       final ExtensionRegistry extensionRegistry,
1294                       final Message.Builder builder)
1295                       throws ParseException {
1296       final Tokenizer tokenizer = new Tokenizer(input);
1297       MessageReflection.BuilderAdapter target =
1298           new MessageReflection.BuilderAdapter(builder);
1299 
1300       while (!tokenizer.atEnd()) {
1301         mergeField(tokenizer, extensionRegistry, target);
1302       }
1303     }
1304 
1305 
1306     /**
1307      * Parse a single field from {@code tokenizer} and merge it into
1308      * {@code builder}.
1309      */
mergeField(final Tokenizer tokenizer, final ExtensionRegistry extensionRegistry, final MessageReflection.MergeTarget target)1310     private void mergeField(final Tokenizer tokenizer,
1311                             final ExtensionRegistry extensionRegistry,
1312                             final MessageReflection.MergeTarget target)
1313                             throws ParseException {
1314       FieldDescriptor field = null;
1315       final Descriptor type = target.getDescriptorForType();
1316       ExtensionRegistry.ExtensionInfo extension = null;
1317 
1318       if (tokenizer.tryConsume("[")) {
1319         // An extension.
1320         final StringBuilder name =
1321             new StringBuilder(tokenizer.consumeIdentifier());
1322         while (tokenizer.tryConsume(".")) {
1323           name.append('.');
1324           name.append(tokenizer.consumeIdentifier());
1325         }
1326 
1327         extension = target.findExtensionByName(
1328             extensionRegistry, name.toString());
1329 
1330         if (extension == null) {
1331           if (!allowUnknownFields) {
1332             throw tokenizer.parseExceptionPreviousToken(
1333               "Extension \"" + name + "\" not found in the ExtensionRegistry.");
1334           } else {
1335             logger.warning(
1336               "Extension \"" + name + "\" not found in the ExtensionRegistry.");
1337           }
1338         } else {
1339           if (extension.descriptor.getContainingType() != type) {
1340             throw tokenizer.parseExceptionPreviousToken(
1341               "Extension \"" + name + "\" does not extend message type \"" +
1342               type.getFullName() + "\".");
1343           }
1344           field = extension.descriptor;
1345         }
1346 
1347         tokenizer.consume("]");
1348       } else {
1349         final String name = tokenizer.consumeIdentifier();
1350         field = type.findFieldByName(name);
1351 
1352         // Group names are expected to be capitalized as they appear in the
1353         // .proto file, which actually matches their type names, not their field
1354         // names.
1355         if (field == null) {
1356           // Explicitly specify US locale so that this code does not break when
1357           // executing in Turkey.
1358           final String lowerName = name.toLowerCase(Locale.US);
1359           field = type.findFieldByName(lowerName);
1360           // If the case-insensitive match worked but the field is NOT a group,
1361           if (field != null && field.getType() != FieldDescriptor.Type.GROUP) {
1362             field = null;
1363           }
1364         }
1365         // Again, special-case group names as described above.
1366         if (field != null && field.getType() == FieldDescriptor.Type.GROUP &&
1367             !field.getMessageType().getName().equals(name)) {
1368           field = null;
1369         }
1370 
1371         if (field == null) {
1372           if (!allowUnknownFields) {
1373             throw tokenizer.parseExceptionPreviousToken(
1374               "Message type \"" + type.getFullName() +
1375               "\" has no field named \"" + name + "\".");
1376           } else {
1377             logger.warning(
1378               "Message type \"" + type.getFullName() +
1379               "\" has no field named \"" + name + "\".");
1380           }
1381         }
1382       }
1383 
1384       // Skips unknown fields.
1385       if (field == null) {
1386         // Try to guess the type of this field.
1387         // If this field is not a message, there should be a ":" between the
1388         // field name and the field value and also the field value should not
1389         // start with "{" or "<" which indicates the begining of a message body.
1390         // If there is no ":" or there is a "{" or "<" after ":", this field has
1391         // to be a message or the input is ill-formed.
1392         if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("{") &&
1393             !tokenizer.lookingAt("<")) {
1394           skipFieldValue(tokenizer);
1395         } else {
1396           skipFieldMessage(tokenizer);
1397         }
1398         return;
1399       }
1400 
1401       // Handle potential ':'.
1402       if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
1403         tokenizer.tryConsume(":");  // optional
1404       } else {
1405         tokenizer.consume(":");  // required
1406       }
1407       // Support specifying repeated field values as a comma-separated list.
1408       // Ex."foo: [1, 2, 3]"
1409       if (field.isRepeated() && tokenizer.tryConsume("[")) {
1410         while (true) {
1411           consumeFieldValue(tokenizer, extensionRegistry, target, field, extension);
1412           if (tokenizer.tryConsume("]")) {
1413             // End of list.
1414             break;
1415           }
1416           tokenizer.consume(",");
1417         }
1418       } else {
1419         consumeFieldValue(tokenizer, extensionRegistry, target, field, extension);
1420       }
1421     }
1422 
1423     /**
1424      * Parse a single field value from {@code tokenizer} and merge it into
1425      * {@code builder}.
1426      */
consumeFieldValue( final Tokenizer tokenizer, final ExtensionRegistry extensionRegistry, final MessageReflection.MergeTarget target, final FieldDescriptor field, final ExtensionRegistry.ExtensionInfo extension)1427     private void consumeFieldValue(
1428         final Tokenizer tokenizer,
1429         final ExtensionRegistry extensionRegistry,
1430         final MessageReflection.MergeTarget target,
1431         final FieldDescriptor field,
1432         final ExtensionRegistry.ExtensionInfo extension)
1433         throws ParseException {
1434       Object value = null;
1435 
1436       if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
1437         final String endToken;
1438         if (tokenizer.tryConsume("<")) {
1439           endToken = ">";
1440         } else {
1441           tokenizer.consume("{");
1442           endToken = "}";
1443         }
1444 
1445         final MessageReflection.MergeTarget subField;
1446         subField = target.newMergeTargetForField(field,
1447             (extension == null) ? null : extension.defaultInstance);
1448 
1449         while (!tokenizer.tryConsume(endToken)) {
1450           if (tokenizer.atEnd()) {
1451             throw tokenizer.parseException(
1452               "Expected \"" + endToken + "\".");
1453           }
1454           mergeField(tokenizer, extensionRegistry, subField);
1455         }
1456 
1457         value = subField.finish();
1458 
1459       } else {
1460         switch (field.getType()) {
1461           case INT32:
1462           case SINT32:
1463           case SFIXED32:
1464             value = tokenizer.consumeInt32();
1465             break;
1466 
1467           case INT64:
1468           case SINT64:
1469           case SFIXED64:
1470             value = tokenizer.consumeInt64();
1471             break;
1472 
1473           case UINT32:
1474           case FIXED32:
1475             value = tokenizer.consumeUInt32();
1476             break;
1477 
1478           case UINT64:
1479           case FIXED64:
1480             value = tokenizer.consumeUInt64();
1481             break;
1482 
1483           case FLOAT:
1484             value = tokenizer.consumeFloat();
1485             break;
1486 
1487           case DOUBLE:
1488             value = tokenizer.consumeDouble();
1489             break;
1490 
1491           case BOOL:
1492             value = tokenizer.consumeBoolean();
1493             break;
1494 
1495           case STRING:
1496             value = tokenizer.consumeString();
1497             break;
1498 
1499           case BYTES:
1500             value = tokenizer.consumeByteString();
1501             break;
1502 
1503           case ENUM:
1504             final EnumDescriptor enumType = field.getEnumType();
1505 
1506             if (tokenizer.lookingAtInteger()) {
1507               final int number = tokenizer.consumeInt32();
1508               value = enumType.findValueByNumber(number);
1509               if (value == null) {
1510                 throw tokenizer.parseExceptionPreviousToken(
1511                   "Enum type \"" + enumType.getFullName() +
1512                   "\" has no value with number " + number + '.');
1513               }
1514             } else {
1515               final String id = tokenizer.consumeIdentifier();
1516               value = enumType.findValueByName(id);
1517               if (value == null) {
1518                 throw tokenizer.parseExceptionPreviousToken(
1519                   "Enum type \"" + enumType.getFullName() +
1520                   "\" has no value named \"" + id + "\".");
1521               }
1522             }
1523 
1524             break;
1525 
1526           case MESSAGE:
1527           case GROUP:
1528             throw new RuntimeException("Can't get here.");
1529         }
1530       }
1531 
1532       if (field.isRepeated()) {
1533         target.addRepeatedField(field, value);
1534       } else if ((singularOverwritePolicy
1535               == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES)
1536           && target.hasField(field)) {
1537         throw tokenizer.parseExceptionPreviousToken("Non-repeated field \""
1538             + field.getFullName() + "\" cannot be overwritten.");
1539       } else if ((singularOverwritePolicy
1540               == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES)
1541           && field.getContainingOneof() != null
1542           && target.hasOneof(field.getContainingOneof())) {
1543         Descriptors.OneofDescriptor oneof = field.getContainingOneof();
1544         throw tokenizer.parseExceptionPreviousToken("Field \""
1545             + field.getFullName() + "\" is specified along with field \""
1546             + target.getOneofFieldDescriptor(oneof).getFullName()
1547             + "\", another member of oneof \"" + oneof.getName() + "\".");
1548       } else {
1549         target.setField(field, value);
1550       }
1551     }
1552 
1553     /**
1554      * Skips the next field including the field's name and value.
1555      */
skipField(Tokenizer tokenizer)1556     private void skipField(Tokenizer tokenizer) throws ParseException {
1557       if (tokenizer.tryConsume("[")) {
1558         // Extension name.
1559         do {
1560           tokenizer.consumeIdentifier();
1561         } while (tokenizer.tryConsume("."));
1562         tokenizer.consume("]");
1563       } else {
1564         tokenizer.consumeIdentifier();
1565       }
1566 
1567       // Try to guess the type of this field.
1568       // If this field is not a message, there should be a ":" between the
1569       // field name and the field value and also the field value should not
1570       // start with "{" or "<" which indicates the begining of a message body.
1571       // If there is no ":" or there is a "{" or "<" after ":", this field has
1572       // to be a message or the input is ill-formed.
1573       if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("<") &&
1574           !tokenizer.lookingAt("{")) {
1575         skipFieldValue(tokenizer);
1576       } else {
1577         skipFieldMessage(tokenizer);
1578       }
1579       // For historical reasons, fields may optionally be separated by commas or
1580       // semicolons.
1581       if (!tokenizer.tryConsume(";")) {
1582         tokenizer.tryConsume(",");
1583       }
1584     }
1585 
1586     /**
1587      * Skips the whole body of a message including the beginning delimeter and
1588      * the ending delimeter.
1589      */
skipFieldMessage(Tokenizer tokenizer)1590     private void skipFieldMessage(Tokenizer tokenizer) throws ParseException {
1591       final String delimiter;
1592       if (tokenizer.tryConsume("<")) {
1593         delimiter = ">";
1594       } else {
1595         tokenizer.consume("{");
1596         delimiter = "}";
1597       }
1598       while (!tokenizer.lookingAt(">") && !tokenizer.lookingAt("}")) {
1599         skipField(tokenizer);
1600       }
1601       tokenizer.consume(delimiter);
1602     }
1603 
1604     /**
1605      * Skips a field value.
1606      */
skipFieldValue(Tokenizer tokenizer)1607     private void skipFieldValue(Tokenizer tokenizer) throws ParseException {
1608       if (tokenizer.tryConsumeString()) {
1609         while (tokenizer.tryConsumeString()) {}
1610         return;
1611       }
1612       if (!tokenizer.tryConsumeIdentifier() &&  // includes enum & boolean
1613           !tokenizer.tryConsumeInt64() &&       // includes int32
1614           !tokenizer.tryConsumeUInt64() &&      // includes uint32
1615           !tokenizer.tryConsumeDouble() &&
1616           !tokenizer.tryConsumeFloat()) {
1617         throw tokenizer.parseException(
1618             "Invalid field value: " + tokenizer.currentToken);
1619       }
1620     }
1621   }
1622 
1623   // =================================================================
1624   // Utility functions
1625   //
1626   // Some of these methods are package-private because Descriptors.java uses
1627   // them.
1628 
1629   private interface ByteSequence {
size()1630     int size();
byteAt(int offset)1631     byte byteAt(int offset);
1632   }
1633 
1634   /**
1635    * Escapes bytes in the format used in protocol buffer text format, which
1636    * is the same as the format used for C string literals.  All bytes
1637    * that are not printable 7-bit ASCII characters are escaped, as well as
1638    * backslash, single-quote, and double-quote characters.  Characters for
1639    * which no defined short-hand escape sequence is defined will be escaped
1640    * using 3-digit octal sequences.
1641    */
escapeBytes(final ByteSequence input)1642   private static String escapeBytes(final ByteSequence input) {
1643     final StringBuilder builder = new StringBuilder(input.size());
1644     for (int i = 0; i < input.size(); i++) {
1645       final byte b = input.byteAt(i);
1646       switch (b) {
1647         // Java does not recognize \a or \v, apparently.
1648         case 0x07: builder.append("\\a" ); break;
1649         case '\b': builder.append("\\b" ); break;
1650         case '\f': builder.append("\\f" ); break;
1651         case '\n': builder.append("\\n" ); break;
1652         case '\r': builder.append("\\r" ); break;
1653         case '\t': builder.append("\\t" ); break;
1654         case 0x0b: builder.append("\\v" ); break;
1655         case '\\': builder.append("\\\\"); break;
1656         case '\'': builder.append("\\\'"); break;
1657         case '"' : builder.append("\\\""); break;
1658         default:
1659           // Note:  Bytes with the high-order bit set should be escaped.  Since
1660           //   bytes are signed, such bytes will compare less than 0x20, hence
1661           //   the following line is correct.
1662           if (b >= 0x20) {
1663             builder.append((char) b);
1664           } else {
1665             builder.append('\\');
1666             builder.append((char) ('0' + ((b >>> 6) & 3)));
1667             builder.append((char) ('0' + ((b >>> 3) & 7)));
1668             builder.append((char) ('0' + (b & 7)));
1669           }
1670           break;
1671       }
1672     }
1673     return builder.toString();
1674   }
1675 
1676   /**
1677    * Escapes bytes in the format used in protocol buffer text format, which
1678    * is the same as the format used for C string literals.  All bytes
1679    * that are not printable 7-bit ASCII characters are escaped, as well as
1680    * backslash, single-quote, and double-quote characters.  Characters for
1681    * which no defined short-hand escape sequence is defined will be escaped
1682    * using 3-digit octal sequences.
1683    */
escapeBytes(final ByteString input)1684   static String escapeBytes(final ByteString input) {
1685     return escapeBytes(new ByteSequence() {
1686       public int size() {
1687         return input.size();
1688       }
1689       public byte byteAt(int offset) {
1690         return input.byteAt(offset);
1691       }
1692     });
1693   }
1694 
1695   /**
1696    * Like {@link #escapeBytes(ByteString)}, but used for byte array.
1697    */
1698   static String escapeBytes(final byte[] input) {
1699     return escapeBytes(new ByteSequence() {
1700       public int size() {
1701         return input.length;
1702       }
1703       public byte byteAt(int offset) {
1704         return input[offset];
1705       }
1706     });
1707   }
1708 
1709   /**
1710    * Un-escape a byte sequence as escaped using
1711    * {@link #escapeBytes(ByteString)}.  Two-digit hex escapes (starting with
1712    * "\x") are also recognized.
1713    */
1714   static ByteString unescapeBytes(final CharSequence charString)
1715       throws InvalidEscapeSequenceException {
1716     // First convert the Java character sequence to UTF-8 bytes.
1717     ByteString input = ByteString.copyFromUtf8(charString.toString());
1718     // Then unescape certain byte sequences introduced by ASCII '\\'.  The valid
1719     // escapes can all be expressed with ASCII characters, so it is safe to
1720     // operate on bytes here.
1721     //
1722     // Unescaping the input byte array will result in a byte sequence that's no
1723     // longer than the input.  That's because each escape sequence is between
1724     // two and four bytes long and stands for a single byte.
1725     final byte[] result = new byte[input.size()];
1726     int pos = 0;
1727     for (int i = 0; i < input.size(); i++) {
1728       byte c = input.byteAt(i);
1729       if (c == '\\') {
1730         if (i + 1 < input.size()) {
1731           ++i;
1732           c = input.byteAt(i);
1733           if (isOctal(c)) {
1734             // Octal escape.
1735             int code = digitValue(c);
1736             if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1737               ++i;
1738               code = code * 8 + digitValue(input.byteAt(i));
1739             }
1740             if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) {
1741               ++i;
1742               code = code * 8 + digitValue(input.byteAt(i));
1743             }
1744             // TODO: Check that 0 <= code && code <= 0xFF.
1745             result[pos++] = (byte)code;
1746           } else {
1747             switch (c) {
1748               case 'a' : result[pos++] = 0x07; break;
1749               case 'b' : result[pos++] = '\b'; break;
1750               case 'f' : result[pos++] = '\f'; break;
1751               case 'n' : result[pos++] = '\n'; break;
1752               case 'r' : result[pos++] = '\r'; break;
1753               case 't' : result[pos++] = '\t'; break;
1754               case 'v' : result[pos++] = 0x0b; break;
1755               case '\\': result[pos++] = '\\'; break;
1756               case '\'': result[pos++] = '\''; break;
1757               case '"' : result[pos++] = '\"'; break;
1758 
1759               case 'x':
1760                 // hex escape
1761                 int code = 0;
1762                 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1763                   ++i;
1764                   code = digitValue(input.byteAt(i));
1765                 } else {
1766                   throw new InvalidEscapeSequenceException(
1767                       "Invalid escape sequence: '\\x' with no digits");
1768                 }
1769                 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) {
1770                   ++i;
1771                   code = code * 16 + digitValue(input.byteAt(i));
1772                 }
1773                 result[pos++] = (byte)code;
1774                 break;
1775 
1776               default:
1777                 throw new InvalidEscapeSequenceException(
1778                     "Invalid escape sequence: '\\" + (char)c + '\'');
1779             }
1780           }
1781         } else {
1782           throw new InvalidEscapeSequenceException(
1783               "Invalid escape sequence: '\\' at end of string.");
1784         }
1785       } else {
1786         result[pos++] = c;
1787       }
1788     }
1789 
1790     return ByteString.copyFrom(result, 0, pos);
1791   }
1792 
1793   /**
1794    * Thrown by {@link TextFormat#unescapeBytes} and
1795    * {@link TextFormat#unescapeText} when an invalid escape sequence is seen.
1796    */
1797   static class InvalidEscapeSequenceException extends IOException {
1798     private static final long serialVersionUID = -8164033650142593304L;
1799 
1800     InvalidEscapeSequenceException(final String description) {
1801       super(description);
1802     }
1803   }
1804 
1805   /**
1806    * Like {@link #escapeBytes(ByteString)}, but escapes a text string.
1807    * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped
1808    * individually as a 3-digit octal escape.  Yes, it's weird.
1809    */
1810   static String escapeText(final String input) {
1811     return escapeBytes(ByteString.copyFromUtf8(input));
1812   }
1813 
1814   /**
1815    * Escape double quotes and backslashes in a String for unicode output of a message.
1816    */
1817   public static String escapeDoubleQuotesAndBackslashes(final String input) {
1818     return input.replace("\\", "\\\\").replace("\"", "\\\"");
1819   }
1820 
1821   /**
1822    * Un-escape a text string as escaped using {@link #escapeText(String)}.
1823    * Two-digit hex escapes (starting with "\x") are also recognized.
1824    */
1825   static String unescapeText(final String input)
1826                              throws InvalidEscapeSequenceException {
1827     return unescapeBytes(input).toStringUtf8();
1828   }
1829 
1830   /** Is this an octal digit? */
1831   private static boolean isOctal(final byte c) {
1832     return '0' <= c && c <= '7';
1833   }
1834 
1835   /** Is this a hex digit? */
1836   private static boolean isHex(final byte c) {
1837     return ('0' <= c && c <= '9') ||
1838            ('a' <= c && c <= 'f') ||
1839            ('A' <= c && c <= 'F');
1840   }
1841 
1842   /**
1843    * Interpret a character as a digit (in any base up to 36) and return the
1844    * numeric value.  This is like {@code Character.digit()} but we don't accept
1845    * non-ASCII digits.
1846    */
1847   private static int digitValue(final byte c) {
1848     if ('0' <= c && c <= '9') {
1849       return c - '0';
1850     } else if ('a' <= c && c <= 'z') {
1851       return c - 'a' + 10;
1852     } else {
1853       return c - 'A' + 10;
1854     }
1855   }
1856 
1857   /**
1858    * Parse a 32-bit signed integer from the text.  Unlike the Java standard
1859    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1860    * and "0" to signify hexadecimal and octal numbers, respectively.
1861    */
1862   static int parseInt32(final String text) throws NumberFormatException {
1863     return (int) parseInteger(text, true, false);
1864   }
1865 
1866   /**
1867    * Parse a 32-bit unsigned integer from the text.  Unlike the Java standard
1868    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1869    * and "0" to signify hexadecimal and octal numbers, respectively.  The
1870    * result is coerced to a (signed) {@code int} when returned since Java has
1871    * no unsigned integer type.
1872    */
1873   static int parseUInt32(final String text) throws NumberFormatException {
1874     return (int) parseInteger(text, false, false);
1875   }
1876 
1877   /**
1878    * Parse a 64-bit signed integer from the text.  Unlike the Java standard
1879    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1880    * and "0" to signify hexadecimal and octal numbers, respectively.
1881    */
1882   static long parseInt64(final String text) throws NumberFormatException {
1883     return parseInteger(text, true, true);
1884   }
1885 
1886   /**
1887    * Parse a 64-bit unsigned integer from the text.  Unlike the Java standard
1888    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1889    * and "0" to signify hexadecimal and octal numbers, respectively.  The
1890    * result is coerced to a (signed) {@code long} when returned since Java has
1891    * no unsigned long type.
1892    */
1893   static long parseUInt64(final String text) throws NumberFormatException {
1894     return parseInteger(text, false, true);
1895   }
1896 
1897   private static long parseInteger(final String text,
1898                                    final boolean isSigned,
1899                                    final boolean isLong)
1900                                    throws NumberFormatException {
1901     int pos = 0;
1902 
1903     boolean negative = false;
1904     if (text.startsWith("-", pos)) {
1905       if (!isSigned) {
1906         throw new NumberFormatException("Number must be positive: " + text);
1907       }
1908       ++pos;
1909       negative = true;
1910     }
1911 
1912     int radix = 10;
1913     if (text.startsWith("0x", pos)) {
1914       pos += 2;
1915       radix = 16;
1916     } else if (text.startsWith("0", pos)) {
1917       radix = 8;
1918     }
1919 
1920     final String numberText = text.substring(pos);
1921 
1922     long result = 0;
1923     if (numberText.length() < 16) {
1924       // Can safely assume no overflow.
1925       result = Long.parseLong(numberText, radix);
1926       if (negative) {
1927         result = -result;
1928       }
1929 
1930       // Check bounds.
1931       // No need to check for 64-bit numbers since they'd have to be 16 chars
1932       // or longer to overflow.
1933       if (!isLong) {
1934         if (isSigned) {
1935           if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
1936             throw new NumberFormatException(
1937               "Number out of range for 32-bit signed integer: " + text);
1938           }
1939         } else {
1940           if (result >= (1L << 32) || result < 0) {
1941             throw new NumberFormatException(
1942               "Number out of range for 32-bit unsigned integer: " + text);
1943           }
1944         }
1945       }
1946     } else {
1947       BigInteger bigValue = new BigInteger(numberText, radix);
1948       if (negative) {
1949         bigValue = bigValue.negate();
1950       }
1951 
1952       // Check bounds.
1953       if (!isLong) {
1954         if (isSigned) {
1955           if (bigValue.bitLength() > 31) {
1956             throw new NumberFormatException(
1957               "Number out of range for 32-bit signed integer: " + text);
1958           }
1959         } else {
1960           if (bigValue.bitLength() > 32) {
1961             throw new NumberFormatException(
1962               "Number out of range for 32-bit unsigned integer: " + text);
1963           }
1964         }
1965       } else {
1966         if (isSigned) {
1967           if (bigValue.bitLength() > 63) {
1968             throw new NumberFormatException(
1969               "Number out of range for 64-bit signed integer: " + text);
1970           }
1971         } else {
1972           if (bigValue.bitLength() > 64) {
1973             throw new NumberFormatException(
1974               "Number out of range for 64-bit unsigned integer: " + text);
1975           }
1976         }
1977       }
1978 
1979       result = bigValue.longValue();
1980     }
1981 
1982     return result;
1983   }
1984 }
1985