• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // http://code.google.com/p/protobuf/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import com.google.protobuf.Descriptors.Descriptor;
34 import com.google.protobuf.Descriptors.FieldDescriptor;
35 import com.google.protobuf.Descriptors.EnumDescriptor;
36 import com.google.protobuf.Descriptors.EnumValueDescriptor;
37 
38 import java.io.IOException;
39 import java.nio.CharBuffer;
40 import java.math.BigInteger;
41 import java.util.ArrayList;
42 import java.util.List;
43 import java.util.Locale;
44 import java.util.Map;
45 import java.util.regex.Matcher;
46 import java.util.regex.Pattern;
47 
48 /**
49  * Provide ascii text parsing and formatting support for proto2 instances.
50  * The implementation largely follows google/protobuf/text_format.cc.
51  *
52  * @author wenboz@google.com Wenbo Zhu
53  * @author kenton@google.com Kenton Varda
54  */
55 public final class TextFormat {
TextFormat()56   private TextFormat() {
57   }
58 
59   /**
60    * Outputs a textual representation of the Protocol Message supplied into
61    * the parameter output. (This representation is the new version of the
62    * classic "ProtocolPrinter" output from the original Protocol Buffer system)
63    */
print(final Message message, final Appendable output)64   public static void print(final Message message, final Appendable output)
65                            throws IOException {
66     final TextGenerator generator = new TextGenerator(output);
67     print(message, generator);
68   }
69 
70   /** Outputs a textual representation of {@code fields} to {@code output}. */
print(final UnknownFieldSet fields, final Appendable output)71   public static void print(final UnknownFieldSet fields,
72                            final Appendable output)
73                            throws IOException {
74     final TextGenerator generator = new TextGenerator(output);
75     printUnknownFields(fields, generator);
76   }
77 
78   /**
79    * Like {@code print()}, but writes directly to a {@code String} and
80    * returns it.
81    */
printToString(final Message message)82   public static String printToString(final Message message) {
83     try {
84       final StringBuilder text = new StringBuilder();
85       print(message, text);
86       return text.toString();
87     } catch (IOException e) {
88       throw new RuntimeException(
89         "Writing to a StringBuilder threw an IOException (should never " +
90         "happen).", e);
91     }
92   }
93 
94   /**
95    * Like {@code print()}, but writes directly to a {@code String} and
96    * returns it.
97    */
printToString(final UnknownFieldSet fields)98   public static String printToString(final UnknownFieldSet fields) {
99     try {
100       final StringBuilder text = new StringBuilder();
101       print(fields, text);
102       return text.toString();
103     } catch (IOException e) {
104       throw new RuntimeException(
105         "Writing to a StringBuilder threw an IOException (should never " +
106         "happen).", e);
107     }
108   }
109 
print(final Message message, final TextGenerator generator)110   private static void print(final Message message,
111                             final TextGenerator generator)
112       throws IOException {
113     for (final Map.Entry<FieldDescriptor, Object> field :
114          message.getAllFields().entrySet()) {
115       printField(field.getKey(), field.getValue(), generator);
116     }
117     printUnknownFields(message.getUnknownFields(), generator);
118   }
119 
printField(final FieldDescriptor field, final Object value, final Appendable output)120   public static void printField(final FieldDescriptor field,
121                                 final Object value,
122                                 final Appendable output)
123                                 throws IOException {
124     final TextGenerator generator = new TextGenerator(output);
125     printField(field, value, generator);
126   }
127 
printFieldToString(final FieldDescriptor field, final Object value)128   public static String printFieldToString(final FieldDescriptor field,
129                                           final Object value) {
130     try {
131       final StringBuilder text = new StringBuilder();
132       printField(field, value, text);
133       return text.toString();
134     } catch (IOException e) {
135       throw new RuntimeException(
136         "Writing to a StringBuilder threw an IOException (should never " +
137         "happen).", e);
138     }
139   }
140 
printField(final FieldDescriptor field, final Object value, final TextGenerator generator)141   private static void printField(final FieldDescriptor field,
142                                 final Object value,
143                                 final TextGenerator generator)
144                                 throws IOException {
145     if (field.isRepeated()) {
146       // Repeated field.  Print each element.
147       for (final Object element : (List) value) {
148         printSingleField(field, element, generator);
149       }
150     } else {
151       printSingleField(field, value, generator);
152     }
153   }
154 
printSingleField(final FieldDescriptor field, final Object value, final TextGenerator generator)155   private static void printSingleField(final FieldDescriptor field,
156                                        final Object value,
157                                        final TextGenerator generator)
158                                        throws IOException {
159     if (field.isExtension()) {
160       generator.print("[");
161       // We special-case MessageSet elements for compatibility with proto1.
162       if (field.getContainingType().getOptions().getMessageSetWireFormat()
163           && (field.getType() == FieldDescriptor.Type.MESSAGE)
164           && (field.isOptional())
165           // object equality
166           && (field.getExtensionScope() == field.getMessageType())) {
167         generator.print(field.getMessageType().getFullName());
168       } else {
169         generator.print(field.getFullName());
170       }
171       generator.print("]");
172     } else {
173       if (field.getType() == FieldDescriptor.Type.GROUP) {
174         // Groups must be serialized with their original capitalization.
175         generator.print(field.getMessageType().getName());
176       } else {
177         generator.print(field.getName());
178       }
179     }
180 
181     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
182       generator.print(" {\n");
183       generator.indent();
184     } else {
185       generator.print(": ");
186     }
187 
188     printFieldValue(field, value, generator);
189 
190     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
191       generator.outdent();
192       generator.print("}");
193     }
194     generator.print("\n");
195   }
196 
printFieldValue(final FieldDescriptor field, final Object value, final TextGenerator generator)197   private static void printFieldValue(final FieldDescriptor field,
198                                       final Object value,
199                                       final TextGenerator generator)
200                                       throws IOException {
201     switch (field.getType()) {
202       case INT32:
203       case INT64:
204       case SINT32:
205       case SINT64:
206       case SFIXED32:
207       case SFIXED64:
208       case FLOAT:
209       case DOUBLE:
210       case BOOL:
211         // Good old toString() does what we want for these types.
212         generator.print(value.toString());
213         break;
214 
215       case UINT32:
216       case FIXED32:
217         generator.print(unsignedToString((Integer) value));
218         break;
219 
220       case UINT64:
221       case FIXED64:
222         generator.print(unsignedToString((Long) value));
223         break;
224 
225       case STRING:
226         generator.print("\"");
227         generator.print(escapeText((String) value));
228         generator.print("\"");
229         break;
230 
231       case BYTES:
232         generator.print("\"");
233         generator.print(escapeBytes((ByteString) value));
234         generator.print("\"");
235         break;
236 
237       case ENUM:
238         generator.print(((EnumValueDescriptor) value).getName());
239         break;
240 
241       case MESSAGE:
242       case GROUP:
243         print((Message) value, generator);
244         break;
245     }
246   }
247 
printUnknownFields(final UnknownFieldSet unknownFields, final TextGenerator generator)248   private static void printUnknownFields(final UnknownFieldSet unknownFields,
249                                          final TextGenerator generator)
250                                          throws IOException {
251     for (final Map.Entry<Integer, UnknownFieldSet.Field> entry :
252          unknownFields.asMap().entrySet()) {
253       final String prefix = entry.getKey().toString() + ": ";
254       final UnknownFieldSet.Field field = entry.getValue();
255 
256       for (final long value : field.getVarintList()) {
257         generator.print(entry.getKey().toString());
258         generator.print(": ");
259         generator.print(unsignedToString(value));
260         generator.print("\n");
261       }
262       for (final int value : field.getFixed32List()) {
263         generator.print(entry.getKey().toString());
264         generator.print(": ");
265         generator.print(String.format((Locale) null, "0x%08x", value));
266         generator.print("\n");
267       }
268       for (final long value : field.getFixed64List()) {
269         generator.print(entry.getKey().toString());
270         generator.print(": ");
271         generator.print(String.format((Locale) null, "0x%016x", value));
272         generator.print("\n");
273       }
274       for (final ByteString value : field.getLengthDelimitedList()) {
275         generator.print(entry.getKey().toString());
276         generator.print(": \"");
277         generator.print(escapeBytes(value));
278         generator.print("\"\n");
279       }
280       for (final UnknownFieldSet value : field.getGroupList()) {
281         generator.print(entry.getKey().toString());
282         generator.print(" {\n");
283         generator.indent();
284         printUnknownFields(value, generator);
285         generator.outdent();
286         generator.print("}\n");
287       }
288     }
289   }
290 
291   /** Convert an unsigned 32-bit integer to a string. */
unsignedToString(final int value)292   private static String unsignedToString(final int value) {
293     if (value >= 0) {
294       return Integer.toString(value);
295     } else {
296       return Long.toString(((long) value) & 0x00000000FFFFFFFFL);
297     }
298   }
299 
300   /** Convert an unsigned 64-bit integer to a string. */
unsignedToString(final long value)301   private static String unsignedToString(final long value) {
302     if (value >= 0) {
303       return Long.toString(value);
304     } else {
305       // Pull off the most-significant bit so that BigInteger doesn't think
306       // the number is negative, then set it again using setBit().
307       return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL)
308                        .setBit(63).toString();
309     }
310   }
311 
312   /**
313    * An inner class for writing text to the output stream.
314    */
315   private static final class TextGenerator {
316     private Appendable output;
317     private boolean atStartOfLine = true;
318     private final StringBuilder indent = new StringBuilder();
319 
TextGenerator(final Appendable output)320     private TextGenerator(final Appendable output) {
321       this.output = output;
322     }
323 
324     /**
325      * Indent text by two spaces.  After calling Indent(), two spaces will be
326      * inserted at the beginning of each line of text.  Indent() may be called
327      * multiple times to produce deeper indents.
328      */
indent()329     public void indent() {
330       indent.append("  ");
331     }
332 
333     /**
334      * Reduces the current indent level by two spaces, or crashes if the indent
335      * level is zero.
336      */
outdent()337     public void outdent() {
338       final int length = indent.length();
339       if (length == 0) {
340         throw new IllegalArgumentException(
341             " Outdent() without matching Indent().");
342       }
343       indent.delete(length - 2, length);
344     }
345 
346     /**
347      * Print text to the output stream.
348      */
print(final CharSequence text)349     public void print(final CharSequence text) throws IOException {
350       final int size = text.length();
351       int pos = 0;
352 
353       for (int i = 0; i < size; i++) {
354         if (text.charAt(i) == '\n') {
355           write(text.subSequence(pos, size), i - pos + 1);
356           pos = i + 1;
357           atStartOfLine = true;
358         }
359       }
360       write(text.subSequence(pos, size), size - pos);
361     }
362 
write(final CharSequence data, final int size)363     private void write(final CharSequence data, final int size)
364                        throws IOException {
365       if (size == 0) {
366         return;
367       }
368       if (atStartOfLine) {
369         atStartOfLine = false;
370         output.append(indent);
371       }
372       output.append(data);
373     }
374   }
375 
376   // =================================================================
377   // Parsing
378 
379   /**
380    * Represents a stream of tokens parsed from a {@code String}.
381    *
382    * <p>The Java standard library provides many classes that you might think
383    * would be useful for implementing this, but aren't.  For example:
384    *
385    * <ul>
386    * <li>{@code java.io.StreamTokenizer}:  This almost does what we want -- or,
387    *   at least, something that would get us close to what we want -- except
388    *   for one fatal flaw:  It automatically un-escapes strings using Java
389    *   escape sequences, which do not include all the escape sequences we
390    *   need to support (e.g. '\x').
391    * <li>{@code java.util.Scanner}:  This seems like a great way at least to
392    *   parse regular expressions out of a stream (so we wouldn't have to load
393    *   the entire input into a single string before parsing).  Sadly,
394    *   {@code Scanner} requires that tokens be delimited with some delimiter.
395    *   Thus, although the text "foo:" should parse to two tokens ("foo" and
396    *   ":"), {@code Scanner} would recognize it only as a single token.
397    *   Furthermore, {@code Scanner} provides no way to inspect the contents
398    *   of delimiters, making it impossible to keep track of line and column
399    *   numbers.
400    * </ul>
401    *
402    * <p>Luckily, Java's regular expression support does manage to be useful to
403    * us.  (Barely:  We need {@code Matcher.usePattern()}, which is new in
404    * Java 1.5.)  So, we can use that, at least.  Unfortunately, this implies
405    * that we need to have the entire input in one contiguous string.
406    */
407   private static final class Tokenizer {
408     private final CharSequence text;
409     private final Matcher matcher;
410     private String currentToken;
411 
412     // The character index within this.text at which the current token begins.
413     private int pos = 0;
414 
415     // The line and column numbers of the current token.
416     private int line = 0;
417     private int column = 0;
418 
419     // The line and column numbers of the previous token (allows throwing
420     // errors *after* consuming).
421     private int previousLine = 0;
422     private int previousColumn = 0;
423 
424     // We use possesive quantifiers (*+ and ++) because otherwise the Java
425     // regex matcher has stack overflows on large inputs.
426     private static final Pattern WHITESPACE =
427       Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
428     private static final Pattern TOKEN = Pattern.compile(
429       "[a-zA-Z_][0-9a-zA-Z_+-]*+|" +                // an identifier
430       "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" +             // a number
431       "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" +       // a double-quoted string
432       "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)",         // a single-quoted string
433       Pattern.MULTILINE);
434 
435     private static final Pattern DOUBLE_INFINITY = Pattern.compile(
436       "-?inf(inity)?",
437       Pattern.CASE_INSENSITIVE);
438     private static final Pattern FLOAT_INFINITY = Pattern.compile(
439       "-?inf(inity)?f?",
440       Pattern.CASE_INSENSITIVE);
441     private static final Pattern FLOAT_NAN = Pattern.compile(
442       "nanf?",
443       Pattern.CASE_INSENSITIVE);
444 
445     /** Construct a tokenizer that parses tokens from the given text. */
Tokenizer(final CharSequence text)446     private Tokenizer(final CharSequence text) {
447       this.text = text;
448       this.matcher = WHITESPACE.matcher(text);
449       skipWhitespace();
450       nextToken();
451     }
452 
453     /** Are we at the end of the input? */
atEnd()454     public boolean atEnd() {
455       return currentToken.length() == 0;
456     }
457 
458     /** Advance to the next token. */
nextToken()459     public void nextToken() {
460       previousLine = line;
461       previousColumn = column;
462 
463       // Advance the line counter to the current position.
464       while (pos < matcher.regionStart()) {
465         if (text.charAt(pos) == '\n') {
466           ++line;
467           column = 0;
468         } else {
469           ++column;
470         }
471         ++pos;
472       }
473 
474       // Match the next token.
475       if (matcher.regionStart() == matcher.regionEnd()) {
476         // EOF
477         currentToken = "";
478       } else {
479         matcher.usePattern(TOKEN);
480         if (matcher.lookingAt()) {
481           currentToken = matcher.group();
482           matcher.region(matcher.end(), matcher.regionEnd());
483         } else {
484           // Take one character.
485           currentToken = String.valueOf(text.charAt(pos));
486           matcher.region(pos + 1, matcher.regionEnd());
487         }
488 
489         skipWhitespace();
490       }
491     }
492 
493     /**
494      * Skip over any whitespace so that the matcher region starts at the next
495      * token.
496      */
skipWhitespace()497     private void skipWhitespace() {
498       matcher.usePattern(WHITESPACE);
499       if (matcher.lookingAt()) {
500         matcher.region(matcher.end(), matcher.regionEnd());
501       }
502     }
503 
504     /**
505      * If the next token exactly matches {@code token}, consume it and return
506      * {@code true}.  Otherwise, return {@code false} without doing anything.
507      */
tryConsume(final String token)508     public boolean tryConsume(final String token) {
509       if (currentToken.equals(token)) {
510         nextToken();
511         return true;
512       } else {
513         return false;
514       }
515     }
516 
517     /**
518      * If the next token exactly matches {@code token}, consume it.  Otherwise,
519      * throw a {@link ParseException}.
520      */
consume(final String token)521     public void consume(final String token) throws ParseException {
522       if (!tryConsume(token)) {
523         throw parseException("Expected \"" + token + "\".");
524       }
525     }
526 
527     /**
528      * Returns {@code true} if the next token is an integer, but does
529      * not consume it.
530      */
lookingAtInteger()531     public boolean lookingAtInteger() {
532       if (currentToken.length() == 0) {
533         return false;
534       }
535 
536       final char c = currentToken.charAt(0);
537       return ('0' <= c && c <= '9') ||
538              c == '-' || c == '+';
539     }
540 
541     /**
542      * If the next token is an identifier, consume it and return its value.
543      * Otherwise, throw a {@link ParseException}.
544      */
consumeIdentifier()545     public String consumeIdentifier() throws ParseException {
546       for (int i = 0; i < currentToken.length(); i++) {
547         final char c = currentToken.charAt(i);
548         if (('a' <= c && c <= 'z') ||
549             ('A' <= c && c <= 'Z') ||
550             ('0' <= c && c <= '9') ||
551             (c == '_') || (c == '.')) {
552           // OK
553         } else {
554           throw parseException("Expected identifier.");
555         }
556       }
557 
558       final String result = currentToken;
559       nextToken();
560       return result;
561     }
562 
563     /**
564      * If the next token is a 32-bit signed integer, consume it and return its
565      * value.  Otherwise, throw a {@link ParseException}.
566      */
consumeInt32()567     public int consumeInt32() throws ParseException {
568       try {
569         final int result = parseInt32(currentToken);
570         nextToken();
571         return result;
572       } catch (NumberFormatException e) {
573         throw integerParseException(e);
574       }
575     }
576 
577     /**
578      * If the next token is a 32-bit unsigned integer, consume it and return its
579      * value.  Otherwise, throw a {@link ParseException}.
580      */
consumeUInt32()581     public int consumeUInt32() throws ParseException {
582       try {
583         final int result = parseUInt32(currentToken);
584         nextToken();
585         return result;
586       } catch (NumberFormatException e) {
587         throw integerParseException(e);
588       }
589     }
590 
591     /**
592      * If the next token is a 64-bit signed integer, consume it and return its
593      * value.  Otherwise, throw a {@link ParseException}.
594      */
consumeInt64()595     public long consumeInt64() throws ParseException {
596       try {
597         final long result = parseInt64(currentToken);
598         nextToken();
599         return result;
600       } catch (NumberFormatException e) {
601         throw integerParseException(e);
602       }
603     }
604 
605     /**
606      * If the next token is a 64-bit unsigned integer, consume it and return its
607      * value.  Otherwise, throw a {@link ParseException}.
608      */
consumeUInt64()609     public long consumeUInt64() throws ParseException {
610       try {
611         final long result = parseUInt64(currentToken);
612         nextToken();
613         return result;
614       } catch (NumberFormatException e) {
615         throw integerParseException(e);
616       }
617     }
618 
619     /**
620      * If the next token is a double, consume it and return its value.
621      * Otherwise, throw a {@link ParseException}.
622      */
consumeDouble()623     public double consumeDouble() throws ParseException {
624       // We need to parse infinity and nan separately because
625       // Double.parseDouble() does not accept "inf", "infinity", or "nan".
626       if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
627         final boolean negative = currentToken.startsWith("-");
628         nextToken();
629         return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
630       }
631       if (currentToken.equalsIgnoreCase("nan")) {
632         nextToken();
633         return Double.NaN;
634       }
635       try {
636         final double result = Double.parseDouble(currentToken);
637         nextToken();
638         return result;
639       } catch (NumberFormatException e) {
640         throw floatParseException(e);
641       }
642     }
643 
644     /**
645      * If the next token is a float, consume it and return its value.
646      * Otherwise, throw a {@link ParseException}.
647      */
consumeFloat()648     public float consumeFloat() throws ParseException {
649       // We need to parse infinity and nan separately because
650       // Float.parseFloat() does not accept "inf", "infinity", or "nan".
651       if (FLOAT_INFINITY.matcher(currentToken).matches()) {
652         final boolean negative = currentToken.startsWith("-");
653         nextToken();
654         return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
655       }
656       if (FLOAT_NAN.matcher(currentToken).matches()) {
657         nextToken();
658         return Float.NaN;
659       }
660       try {
661         final float result = Float.parseFloat(currentToken);
662         nextToken();
663         return result;
664       } catch (NumberFormatException e) {
665         throw floatParseException(e);
666       }
667     }
668 
669     /**
670      * If the next token is a boolean, consume it and return its value.
671      * Otherwise, throw a {@link ParseException}.
672      */
consumeBoolean()673     public boolean consumeBoolean() throws ParseException {
674       if (currentToken.equals("true")) {
675         nextToken();
676         return true;
677       } else if (currentToken.equals("false")) {
678         nextToken();
679         return false;
680       } else {
681         throw parseException("Expected \"true\" or \"false\".");
682       }
683     }
684 
685     /**
686      * If the next token is a string, consume it and return its (unescaped)
687      * value.  Otherwise, throw a {@link ParseException}.
688      */
consumeString()689     public String consumeString() throws ParseException {
690       return consumeByteString().toStringUtf8();
691     }
692 
693     /**
694      * If the next token is a string, consume it, unescape it as a
695      * {@link ByteString}, and return it.  Otherwise, throw a
696      * {@link ParseException}.
697      */
consumeByteString()698     public ByteString consumeByteString() throws ParseException {
699       List<ByteString> list = new ArrayList<ByteString>();
700       consumeByteString(list);
701       while (currentToken.startsWith("'") || currentToken.startsWith("\"")) {
702         consumeByteString(list);
703       }
704       return ByteString.copyFrom(list);
705     }
706 
707     /**
708      * Like {@link #consumeByteString()} but adds each token of the string to
709      * the given list.  String literals (whether bytes or text) may come in
710      * multiple adjacent tokens which are automatically concatenated, like in
711      * C or Python.
712      */
consumeByteString(List<ByteString> list)713     private void consumeByteString(List<ByteString> list) throws ParseException {
714       final char quote = currentToken.length() > 0 ? currentToken.charAt(0)
715                                                    : '\0';
716       if (quote != '\"' && quote != '\'') {
717         throw parseException("Expected string.");
718       }
719 
720       if (currentToken.length() < 2 ||
721           currentToken.charAt(currentToken.length() - 1) != quote) {
722         throw parseException("String missing ending quote.");
723       }
724 
725       try {
726         final String escaped =
727             currentToken.substring(1, currentToken.length() - 1);
728         final ByteString result = unescapeBytes(escaped);
729         nextToken();
730         list.add(result);
731       } catch (InvalidEscapeSequenceException e) {
732         throw parseException(e.getMessage());
733       }
734     }
735 
736     /**
737      * Returns a {@link ParseException} with the current line and column
738      * numbers in the description, suitable for throwing.
739      */
parseException(final String description)740     public ParseException parseException(final String description) {
741       // Note:  People generally prefer one-based line and column numbers.
742       return new ParseException(
743         (line + 1) + ":" + (column + 1) + ": " + description);
744     }
745 
746     /**
747      * Returns a {@link ParseException} with the line and column numbers of
748      * the previous token in the description, suitable for throwing.
749      */
parseExceptionPreviousToken( final String description)750     public ParseException parseExceptionPreviousToken(
751         final String description) {
752       // Note:  People generally prefer one-based line and column numbers.
753       return new ParseException(
754         (previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
755     }
756 
757     /**
758      * Constructs an appropriate {@link ParseException} for the given
759      * {@code NumberFormatException} when trying to parse an integer.
760      */
integerParseException( final NumberFormatException e)761     private ParseException integerParseException(
762         final NumberFormatException e) {
763       return parseException("Couldn't parse integer: " + e.getMessage());
764     }
765 
766     /**
767      * Constructs an appropriate {@link ParseException} for the given
768      * {@code NumberFormatException} when trying to parse a float or double.
769      */
floatParseException(final NumberFormatException e)770     private ParseException floatParseException(final NumberFormatException e) {
771       return parseException("Couldn't parse number: " + e.getMessage());
772     }
773   }
774 
775   /** Thrown when parsing an invalid text format message. */
776   public static class ParseException extends IOException {
777     private static final long serialVersionUID = 3196188060225107702L;
778 
ParseException(final String message)779     public ParseException(final String message) {
780       super(message);
781     }
782   }
783 
784   /**
785    * Parse a text-format message from {@code input} and merge the contents
786    * into {@code builder}.
787    */
merge(final Readable input, final Message.Builder builder)788   public static void merge(final Readable input,
789                            final Message.Builder builder)
790                            throws IOException {
791     merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
792   }
793 
794   /**
795    * Parse a text-format message from {@code input} and merge the contents
796    * into {@code builder}.
797    */
merge(final CharSequence input, final Message.Builder builder)798   public static void merge(final CharSequence input,
799                            final Message.Builder builder)
800                            throws ParseException {
801     merge(input, ExtensionRegistry.getEmptyRegistry(), builder);
802   }
803 
804   /**
805    * Parse a text-format message from {@code input} and merge the contents
806    * into {@code builder}.  Extensions will be recognized if they are
807    * registered in {@code extensionRegistry}.
808    */
merge(final Readable input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)809   public static void merge(final Readable input,
810                            final ExtensionRegistry extensionRegistry,
811                            final Message.Builder builder)
812                            throws IOException {
813     // Read the entire input to a String then parse that.
814 
815     // If StreamTokenizer were not quite so crippled, or if there were a kind
816     // of Reader that could read in chunks that match some particular regex,
817     // or if we wanted to write a custom Reader to tokenize our stream, then
818     // we would not have to read to one big String.  Alas, none of these is
819     // the case.  Oh well.
820 
821     merge(toStringBuilder(input), extensionRegistry, builder);
822   }
823 
824   private static final int BUFFER_SIZE = 4096;
825 
826   // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
827   // overhead is worthwhile
toStringBuilder(final Readable input)828   private static StringBuilder toStringBuilder(final Readable input)
829       throws IOException {
830     final StringBuilder text = new StringBuilder();
831     final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
832     while (true) {
833       final int n = input.read(buffer);
834       if (n == -1) {
835         break;
836       }
837       buffer.flip();
838       text.append(buffer, 0, n);
839     }
840     return text;
841   }
842 
843   /**
844    * Parse a text-format message from {@code input} and merge the contents
845    * into {@code builder}.  Extensions will be recognized if they are
846    * registered in {@code extensionRegistry}.
847    */
merge(final CharSequence input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)848   public static void merge(final CharSequence input,
849                            final ExtensionRegistry extensionRegistry,
850                            final Message.Builder builder)
851                            throws ParseException {
852     final Tokenizer tokenizer = new Tokenizer(input);
853 
854     while (!tokenizer.atEnd()) {
855       mergeField(tokenizer, extensionRegistry, builder);
856     }
857   }
858 
859   /**
860    * Parse a single field from {@code tokenizer} and merge it into
861    * {@code builder}.
862    */
mergeField(final Tokenizer tokenizer, final ExtensionRegistry extensionRegistry, final Message.Builder builder)863   private static void mergeField(final Tokenizer tokenizer,
864                                  final ExtensionRegistry extensionRegistry,
865                                  final Message.Builder builder)
866                                  throws ParseException {
867     FieldDescriptor field;
868     final Descriptor type = builder.getDescriptorForType();
869     ExtensionRegistry.ExtensionInfo extension = null;
870 
871     if (tokenizer.tryConsume("[")) {
872       // An extension.
873       final StringBuilder name =
874           new StringBuilder(tokenizer.consumeIdentifier());
875       while (tokenizer.tryConsume(".")) {
876         name.append('.');
877         name.append(tokenizer.consumeIdentifier());
878       }
879 
880       extension = extensionRegistry.findExtensionByName(name.toString());
881 
882       if (extension == null) {
883         throw tokenizer.parseExceptionPreviousToken(
884           "Extension \"" + name + "\" not found in the ExtensionRegistry.");
885       } else if (extension.descriptor.getContainingType() != type) {
886         throw tokenizer.parseExceptionPreviousToken(
887           "Extension \"" + name + "\" does not extend message type \"" +
888           type.getFullName() + "\".");
889       }
890 
891       tokenizer.consume("]");
892 
893       field = extension.descriptor;
894     } else {
895       final String name = tokenizer.consumeIdentifier();
896       field = type.findFieldByName(name);
897 
898       // Group names are expected to be capitalized as they appear in the
899       // .proto file, which actually matches their type names, not their field
900       // names.
901       if (field == null) {
902         // Explicitly specify US locale so that this code does not break when
903         // executing in Turkey.
904         final String lowerName = name.toLowerCase(Locale.US);
905         field = type.findFieldByName(lowerName);
906         // If the case-insensitive match worked but the field is NOT a group,
907         if (field != null && field.getType() != FieldDescriptor.Type.GROUP) {
908           field = null;
909         }
910       }
911       // Again, special-case group names as described above.
912       if (field != null && field.getType() == FieldDescriptor.Type.GROUP &&
913           !field.getMessageType().getName().equals(name)) {
914         field = null;
915       }
916 
917       if (field == null) {
918         throw tokenizer.parseExceptionPreviousToken(
919           "Message type \"" + type.getFullName() +
920           "\" has no field named \"" + name + "\".");
921       }
922     }
923 
924     Object value = null;
925 
926     if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) {
927       tokenizer.tryConsume(":");  // optional
928 
929       final String endToken;
930       if (tokenizer.tryConsume("<")) {
931         endToken = ">";
932       } else {
933         tokenizer.consume("{");
934         endToken = "}";
935       }
936 
937       final Message.Builder subBuilder;
938       if (extension == null) {
939         subBuilder = builder.newBuilderForField(field);
940       } else {
941         subBuilder = extension.defaultInstance.newBuilderForType();
942       }
943 
944       while (!tokenizer.tryConsume(endToken)) {
945         if (tokenizer.atEnd()) {
946           throw tokenizer.parseException(
947             "Expected \"" + endToken + "\".");
948         }
949         mergeField(tokenizer, extensionRegistry, subBuilder);
950       }
951 
952       value = subBuilder.build();
953 
954     } else {
955       tokenizer.consume(":");
956 
957       switch (field.getType()) {
958         case INT32:
959         case SINT32:
960         case SFIXED32:
961           value = tokenizer.consumeInt32();
962           break;
963 
964         case INT64:
965         case SINT64:
966         case SFIXED64:
967           value = tokenizer.consumeInt64();
968           break;
969 
970         case UINT32:
971         case FIXED32:
972           value = tokenizer.consumeUInt32();
973           break;
974 
975         case UINT64:
976         case FIXED64:
977           value = tokenizer.consumeUInt64();
978           break;
979 
980         case FLOAT:
981           value = tokenizer.consumeFloat();
982           break;
983 
984         case DOUBLE:
985           value = tokenizer.consumeDouble();
986           break;
987 
988         case BOOL:
989           value = tokenizer.consumeBoolean();
990           break;
991 
992         case STRING:
993           value = tokenizer.consumeString();
994           break;
995 
996         case BYTES:
997           value = tokenizer.consumeByteString();
998           break;
999 
1000         case ENUM:
1001           final EnumDescriptor enumType = field.getEnumType();
1002 
1003           if (tokenizer.lookingAtInteger()) {
1004             final int number = tokenizer.consumeInt32();
1005             value = enumType.findValueByNumber(number);
1006             if (value == null) {
1007               throw tokenizer.parseExceptionPreviousToken(
1008                 "Enum type \"" + enumType.getFullName() +
1009                 "\" has no value with number " + number + '.');
1010             }
1011           } else {
1012             final String id = tokenizer.consumeIdentifier();
1013             value = enumType.findValueByName(id);
1014             if (value == null) {
1015               throw tokenizer.parseExceptionPreviousToken(
1016                 "Enum type \"" + enumType.getFullName() +
1017                 "\" has no value named \"" + id + "\".");
1018             }
1019           }
1020 
1021           break;
1022 
1023         case MESSAGE:
1024         case GROUP:
1025           throw new RuntimeException("Can't get here.");
1026       }
1027     }
1028 
1029     if (field.isRepeated()) {
1030       builder.addRepeatedField(field, value);
1031     } else {
1032       builder.setField(field, value);
1033     }
1034   }
1035 
1036   // =================================================================
1037   // Utility functions
1038   //
1039   // Some of these methods are package-private because Descriptors.java uses
1040   // them.
1041 
1042   /**
1043    * Escapes bytes in the format used in protocol buffer text format, which
1044    * is the same as the format used for C string literals.  All bytes
1045    * that are not printable 7-bit ASCII characters are escaped, as well as
1046    * backslash, single-quote, and double-quote characters.  Characters for
1047    * which no defined short-hand escape sequence is defined will be escaped
1048    * using 3-digit octal sequences.
1049    */
escapeBytes(final ByteString input)1050   static String escapeBytes(final ByteString input) {
1051     final StringBuilder builder = new StringBuilder(input.size());
1052     for (int i = 0; i < input.size(); i++) {
1053       final byte b = input.byteAt(i);
1054       switch (b) {
1055         // Java does not recognize \a or \v, apparently.
1056         case 0x07: builder.append("\\a" ); break;
1057         case '\b': builder.append("\\b" ); break;
1058         case '\f': builder.append("\\f" ); break;
1059         case '\n': builder.append("\\n" ); break;
1060         case '\r': builder.append("\\r" ); break;
1061         case '\t': builder.append("\\t" ); break;
1062         case 0x0b: builder.append("\\v" ); break;
1063         case '\\': builder.append("\\\\"); break;
1064         case '\'': builder.append("\\\'"); break;
1065         case '"' : builder.append("\\\""); break;
1066         default:
1067           if (b >= 0x20) {
1068             builder.append((char) b);
1069           } else {
1070             builder.append('\\');
1071             builder.append((char) ('0' + ((b >>> 6) & 3)));
1072             builder.append((char) ('0' + ((b >>> 3) & 7)));
1073             builder.append((char) ('0' + (b & 7)));
1074           }
1075           break;
1076       }
1077     }
1078     return builder.toString();
1079   }
1080 
1081   /**
1082    * Un-escape a byte sequence as escaped using
1083    * {@link #escapeBytes(ByteString)}.  Two-digit hex escapes (starting with
1084    * "\x") are also recognized.
1085    */
unescapeBytes(final CharSequence input)1086   static ByteString unescapeBytes(final CharSequence input)
1087       throws InvalidEscapeSequenceException {
1088     final byte[] result = new byte[input.length()];
1089     int pos = 0;
1090     for (int i = 0; i < input.length(); i++) {
1091       char c = input.charAt(i);
1092       if (c == '\\') {
1093         if (i + 1 < input.length()) {
1094           ++i;
1095           c = input.charAt(i);
1096           if (isOctal(c)) {
1097             // Octal escape.
1098             int code = digitValue(c);
1099             if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) {
1100               ++i;
1101               code = code * 8 + digitValue(input.charAt(i));
1102             }
1103             if (i + 1 < input.length() && isOctal(input.charAt(i + 1))) {
1104               ++i;
1105               code = code * 8 + digitValue(input.charAt(i));
1106             }
1107             result[pos++] = (byte)code;
1108           } else {
1109             switch (c) {
1110               case 'a' : result[pos++] = 0x07; break;
1111               case 'b' : result[pos++] = '\b'; break;
1112               case 'f' : result[pos++] = '\f'; break;
1113               case 'n' : result[pos++] = '\n'; break;
1114               case 'r' : result[pos++] = '\r'; break;
1115               case 't' : result[pos++] = '\t'; break;
1116               case 'v' : result[pos++] = 0x0b; break;
1117               case '\\': result[pos++] = '\\'; break;
1118               case '\'': result[pos++] = '\''; break;
1119               case '"' : result[pos++] = '\"'; break;
1120 
1121               case 'x':
1122                 // hex escape
1123                 int code = 0;
1124                 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) {
1125                   ++i;
1126                   code = digitValue(input.charAt(i));
1127                 } else {
1128                   throw new InvalidEscapeSequenceException(
1129                     "Invalid escape sequence: '\\x' with no digits");
1130                 }
1131                 if (i + 1 < input.length() && isHex(input.charAt(i + 1))) {
1132                   ++i;
1133                   code = code * 16 + digitValue(input.charAt(i));
1134                 }
1135                 result[pos++] = (byte)code;
1136                 break;
1137 
1138               default:
1139                 throw new InvalidEscapeSequenceException(
1140                   "Invalid escape sequence: '\\" + c + '\'');
1141             }
1142           }
1143         } else {
1144           throw new InvalidEscapeSequenceException(
1145             "Invalid escape sequence: '\\' at end of string.");
1146         }
1147       } else {
1148         result[pos++] = (byte)c;
1149       }
1150     }
1151 
1152     return ByteString.copyFrom(result, 0, pos);
1153   }
1154 
1155   /**
1156    * Thrown by {@link TextFormat#unescapeBytes} and
1157    * {@link TextFormat#unescapeText} when an invalid escape sequence is seen.
1158    */
1159   static class InvalidEscapeSequenceException extends IOException {
1160     private static final long serialVersionUID = -8164033650142593304L;
1161 
InvalidEscapeSequenceException(final String description)1162     InvalidEscapeSequenceException(final String description) {
1163       super(description);
1164     }
1165   }
1166 
1167   /**
1168    * Like {@link #escapeBytes(ByteString)}, but escapes a text string.
1169    * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped
1170    * individually as a 3-digit octal escape.  Yes, it's weird.
1171    */
escapeText(final String input)1172   static String escapeText(final String input) {
1173     return escapeBytes(ByteString.copyFromUtf8(input));
1174   }
1175 
1176   /**
1177    * Un-escape a text string as escaped using {@link #escapeText(String)}.
1178    * Two-digit hex escapes (starting with "\x") are also recognized.
1179    */
unescapeText(final String input)1180   static String unescapeText(final String input)
1181                              throws InvalidEscapeSequenceException {
1182     return unescapeBytes(input).toStringUtf8();
1183   }
1184 
1185   /** Is this an octal digit? */
isOctal(final char c)1186   private static boolean isOctal(final char c) {
1187     return '0' <= c && c <= '7';
1188   }
1189 
1190   /** Is this a hex digit? */
isHex(final char c)1191   private static boolean isHex(final char c) {
1192     return ('0' <= c && c <= '9') ||
1193            ('a' <= c && c <= 'f') ||
1194            ('A' <= c && c <= 'F');
1195   }
1196 
1197   /**
1198    * Interpret a character as a digit (in any base up to 36) and return the
1199    * numeric value.  This is like {@code Character.digit()} but we don't accept
1200    * non-ASCII digits.
1201    */
digitValue(final char c)1202   private static int digitValue(final char c) {
1203     if ('0' <= c && c <= '9') {
1204       return c - '0';
1205     } else if ('a' <= c && c <= 'z') {
1206       return c - 'a' + 10;
1207     } else {
1208       return c - 'A' + 10;
1209     }
1210   }
1211 
1212   /**
1213    * Parse a 32-bit signed integer from the text.  Unlike the Java standard
1214    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1215    * and "0" to signify hexidecimal and octal numbers, respectively.
1216    */
parseInt32(final String text)1217   static int parseInt32(final String text) throws NumberFormatException {
1218     return (int) parseInteger(text, true, false);
1219   }
1220 
1221   /**
1222    * Parse a 32-bit unsigned integer from the text.  Unlike the Java standard
1223    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1224    * and "0" to signify hexidecimal and octal numbers, respectively.  The
1225    * result is coerced to a (signed) {@code int} when returned since Java has
1226    * no unsigned integer type.
1227    */
parseUInt32(final String text)1228   static int parseUInt32(final String text) throws NumberFormatException {
1229     return (int) parseInteger(text, false, false);
1230   }
1231 
1232   /**
1233    * Parse a 64-bit signed integer from the text.  Unlike the Java standard
1234    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1235    * and "0" to signify hexidecimal and octal numbers, respectively.
1236    */
parseInt64(final String text)1237   static long parseInt64(final String text) throws NumberFormatException {
1238     return parseInteger(text, true, true);
1239   }
1240 
1241   /**
1242    * Parse a 64-bit unsigned integer from the text.  Unlike the Java standard
1243    * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
1244    * and "0" to signify hexidecimal and octal numbers, respectively.  The
1245    * result is coerced to a (signed) {@code long} when returned since Java has
1246    * no unsigned long type.
1247    */
parseUInt64(final String text)1248   static long parseUInt64(final String text) throws NumberFormatException {
1249     return parseInteger(text, false, true);
1250   }
1251 
parseInteger(final String text, final boolean isSigned, final boolean isLong)1252   private static long parseInteger(final String text,
1253                                    final boolean isSigned,
1254                                    final boolean isLong)
1255                                    throws NumberFormatException {
1256     int pos = 0;
1257 
1258     boolean negative = false;
1259     if (text.startsWith("-", pos)) {
1260       if (!isSigned) {
1261         throw new NumberFormatException("Number must be positive: " + text);
1262       }
1263       ++pos;
1264       negative = true;
1265     }
1266 
1267     int radix = 10;
1268     if (text.startsWith("0x", pos)) {
1269       pos += 2;
1270       radix = 16;
1271     } else if (text.startsWith("0", pos)) {
1272       radix = 8;
1273     }
1274 
1275     final String numberText = text.substring(pos);
1276 
1277     long result = 0;
1278     if (numberText.length() < 16) {
1279       // Can safely assume no overflow.
1280       result = Long.parseLong(numberText, radix);
1281       if (negative) {
1282         result = -result;
1283       }
1284 
1285       // Check bounds.
1286       // No need to check for 64-bit numbers since they'd have to be 16 chars
1287       // or longer to overflow.
1288       if (!isLong) {
1289         if (isSigned) {
1290           if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
1291             throw new NumberFormatException(
1292               "Number out of range for 32-bit signed integer: " + text);
1293           }
1294         } else {
1295           if (result >= (1L << 32) || result < 0) {
1296             throw new NumberFormatException(
1297               "Number out of range for 32-bit unsigned integer: " + text);
1298           }
1299         }
1300       }
1301     } else {
1302       BigInteger bigValue = new BigInteger(numberText, radix);
1303       if (negative) {
1304         bigValue = bigValue.negate();
1305       }
1306 
1307       // Check bounds.
1308       if (!isLong) {
1309         if (isSigned) {
1310           if (bigValue.bitLength() > 31) {
1311             throw new NumberFormatException(
1312               "Number out of range for 32-bit signed integer: " + text);
1313           }
1314         } else {
1315           if (bigValue.bitLength() > 32) {
1316             throw new NumberFormatException(
1317               "Number out of range for 32-bit unsigned integer: " + text);
1318           }
1319         }
1320       } else {
1321         if (isSigned) {
1322           if (bigValue.bitLength() > 63) {
1323             throw new NumberFormatException(
1324               "Number out of range for 64-bit signed integer: " + text);
1325           }
1326         } else {
1327           if (bigValue.bitLength() > 64) {
1328             throw new NumberFormatException(
1329               "Number out of range for 64-bit unsigned integer: " + text);
1330           }
1331         }
1332       }
1333 
1334       result = bigValue.longValue();
1335     }
1336 
1337     return result;
1338   }
1339 }
1340