1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.protobuf; 32 33 import com.google.protobuf.Descriptors.Descriptor; 34 import com.google.protobuf.Descriptors.EnumDescriptor; 35 import com.google.protobuf.Descriptors.EnumValueDescriptor; 36 import com.google.protobuf.Descriptors.FieldDescriptor; 37 38 import java.io.IOException; 39 import java.math.BigInteger; 40 import java.nio.CharBuffer; 41 import java.util.ArrayList; 42 import java.util.List; 43 import java.util.Locale; 44 import java.util.Map; 45 import java.util.logging.Logger; 46 import java.util.regex.Matcher; 47 import java.util.regex.Pattern; 48 49 /** 50 * Provide text parsing and formatting support for proto2 instances. 51 * The implementation largely follows google/protobuf/text_format.cc. 52 * 53 * @author wenboz@google.com Wenbo Zhu 54 * @author kenton@google.com Kenton Varda 55 */ 56 public final class TextFormat { TextFormat()57 private TextFormat() {} 58 59 private static final Logger logger = 60 Logger.getLogger(TextFormat.class.getName()); 61 62 private static final Printer DEFAULT_PRINTER = new Printer(); 63 private static final Printer SINGLE_LINE_PRINTER = 64 (new Printer()).setSingleLineMode(true); 65 private static final Printer UNICODE_PRINTER = 66 (new Printer()).setEscapeNonAscii(false); 67 68 /** 69 * Outputs a textual representation of the Protocol Message supplied into 70 * the parameter output. (This representation is the new version of the 71 * classic "ProtocolPrinter" output from the original Protocol Buffer system) 72 */ print( final MessageOrBuilder message, final Appendable output)73 public static void print( 74 final MessageOrBuilder message, final Appendable output) 75 throws IOException { 76 DEFAULT_PRINTER.print(message, new TextGenerator(output)); 77 } 78 79 /** Outputs a textual representation of {@code fields} to {@code output}. */ print(final UnknownFieldSet fields, final Appendable output)80 public static void print(final UnknownFieldSet fields, 81 final Appendable output) 82 throws IOException { 83 DEFAULT_PRINTER.printUnknownFields(fields, new TextGenerator(output)); 84 } 85 86 /** 87 * Same as {@code print()}, except that non-ASCII characters are not 88 * escaped. 89 */ printUnicode( final MessageOrBuilder message, final Appendable output)90 public static void printUnicode( 91 final MessageOrBuilder message, final Appendable output) 92 throws IOException { 93 UNICODE_PRINTER.print(message, new TextGenerator(output)); 94 } 95 96 /** 97 * Same as {@code print()}, except that non-ASCII characters are not 98 * escaped. 99 */ printUnicode(final UnknownFieldSet fields, final Appendable output)100 public static void printUnicode(final UnknownFieldSet fields, 101 final Appendable output) 102 throws IOException { 103 UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(output)); 104 } 105 106 /** 107 * Generates a human readable form of this message, useful for debugging and 108 * other purposes, with no newline characters. 109 */ shortDebugString(final MessageOrBuilder message)110 public static String shortDebugString(final MessageOrBuilder message) { 111 try { 112 final StringBuilder sb = new StringBuilder(); 113 SINGLE_LINE_PRINTER.print(message, new TextGenerator(sb)); 114 // Single line mode currently might have an extra space at the end. 115 return sb.toString().trim(); 116 } catch (IOException e) { 117 throw new IllegalStateException(e); 118 } 119 } 120 121 /** 122 * Generates a human readable form of the unknown fields, useful for debugging 123 * and other purposes, with no newline characters. 124 */ shortDebugString(final UnknownFieldSet fields)125 public static String shortDebugString(final UnknownFieldSet fields) { 126 try { 127 final StringBuilder sb = new StringBuilder(); 128 SINGLE_LINE_PRINTER.printUnknownFields(fields, new TextGenerator(sb)); 129 // Single line mode currently might have an extra space at the end. 130 return sb.toString().trim(); 131 } catch (IOException e) { 132 throw new IllegalStateException(e); 133 } 134 } 135 136 /** 137 * Like {@code print()}, but writes directly to a {@code String} and 138 * returns it. 139 */ printToString(final MessageOrBuilder message)140 public static String printToString(final MessageOrBuilder message) { 141 try { 142 final StringBuilder text = new StringBuilder(); 143 print(message, text); 144 return text.toString(); 145 } catch (IOException e) { 146 throw new IllegalStateException(e); 147 } 148 } 149 150 /** 151 * Like {@code print()}, but writes directly to a {@code String} and 152 * returns it. 153 */ printToString(final UnknownFieldSet fields)154 public static String printToString(final UnknownFieldSet fields) { 155 try { 156 final StringBuilder text = new StringBuilder(); 157 print(fields, text); 158 return text.toString(); 159 } catch (IOException e) { 160 throw new IllegalStateException(e); 161 } 162 } 163 164 /** 165 * Same as {@code printToString()}, except that non-ASCII characters 166 * in string type fields are not escaped in backslash+octals. 167 */ printToUnicodeString(final MessageOrBuilder message)168 public static String printToUnicodeString(final MessageOrBuilder message) { 169 try { 170 final StringBuilder text = new StringBuilder(); 171 UNICODE_PRINTER.print(message, new TextGenerator(text)); 172 return text.toString(); 173 } catch (IOException e) { 174 throw new IllegalStateException(e); 175 } 176 } 177 178 /** 179 * Same as {@code printToString()}, except that non-ASCII characters 180 * in string type fields are not escaped in backslash+octals. 181 */ printToUnicodeString(final UnknownFieldSet fields)182 public static String printToUnicodeString(final UnknownFieldSet fields) { 183 try { 184 final StringBuilder text = new StringBuilder(); 185 UNICODE_PRINTER.printUnknownFields(fields, new TextGenerator(text)); 186 return text.toString(); 187 } catch (IOException e) { 188 throw new IllegalStateException(e); 189 } 190 } 191 printField(final FieldDescriptor field, final Object value, final Appendable output)192 public static void printField(final FieldDescriptor field, 193 final Object value, 194 final Appendable output) 195 throws IOException { 196 DEFAULT_PRINTER.printField(field, value, new TextGenerator(output)); 197 } 198 printFieldToString(final FieldDescriptor field, final Object value)199 public static String printFieldToString(final FieldDescriptor field, 200 final Object value) { 201 try { 202 final StringBuilder text = new StringBuilder(); 203 printField(field, value, text); 204 return text.toString(); 205 } catch (IOException e) { 206 throw new IllegalStateException(e); 207 } 208 } 209 210 /** 211 * Outputs a textual representation of the value of given field value. 212 * 213 * @param field the descriptor of the field 214 * @param value the value of the field 215 * @param output the output to which to append the formatted value 216 * @throws ClassCastException if the value is not appropriate for the 217 * given field descriptor 218 * @throws IOException if there is an exception writing to the output 219 */ printFieldValue(final FieldDescriptor field, final Object value, final Appendable output)220 public static void printFieldValue(final FieldDescriptor field, 221 final Object value, 222 final Appendable output) 223 throws IOException { 224 DEFAULT_PRINTER.printFieldValue(field, value, new TextGenerator(output)); 225 } 226 227 /** 228 * Outputs a textual representation of the value of an unknown field. 229 * 230 * @param tag the field's tag number 231 * @param value the value of the field 232 * @param output the output to which to append the formatted value 233 * @throws ClassCastException if the value is not appropriate for the 234 * given field descriptor 235 * @throws IOException if there is an exception writing to the output 236 */ printUnknownFieldValue(final int tag, final Object value, final Appendable output)237 public static void printUnknownFieldValue(final int tag, 238 final Object value, 239 final Appendable output) 240 throws IOException { 241 printUnknownFieldValue(tag, value, new TextGenerator(output)); 242 } 243 printUnknownFieldValue(final int tag, final Object value, final TextGenerator generator)244 private static void printUnknownFieldValue(final int tag, 245 final Object value, 246 final TextGenerator generator) 247 throws IOException { 248 switch (WireFormat.getTagWireType(tag)) { 249 case WireFormat.WIRETYPE_VARINT: 250 generator.print(unsignedToString((Long) value)); 251 break; 252 case WireFormat.WIRETYPE_FIXED32: 253 generator.print( 254 String.format((Locale) null, "0x%08x", (Integer) value)); 255 break; 256 case WireFormat.WIRETYPE_FIXED64: 257 generator.print(String.format((Locale) null, "0x%016x", (Long) value)); 258 break; 259 case WireFormat.WIRETYPE_LENGTH_DELIMITED: 260 generator.print("\""); 261 generator.print(escapeBytes((ByteString) value)); 262 generator.print("\""); 263 break; 264 case WireFormat.WIRETYPE_START_GROUP: 265 DEFAULT_PRINTER.printUnknownFields((UnknownFieldSet) value, generator); 266 break; 267 default: 268 throw new IllegalArgumentException("Bad tag: " + tag); 269 } 270 } 271 272 /** Helper class for converting protobufs to text. */ 273 private static final class Printer { 274 /** Whether to omit newlines from the output. */ 275 boolean singleLineMode = false; 276 277 /** Whether to escape non ASCII characters with backslash and octal. */ 278 boolean escapeNonAscii = true; 279 Printer()280 private Printer() {} 281 282 /** Setter of singleLineMode */ setSingleLineMode(boolean singleLineMode)283 private Printer setSingleLineMode(boolean singleLineMode) { 284 this.singleLineMode = singleLineMode; 285 return this; 286 } 287 288 /** Setter of escapeNonAscii */ setEscapeNonAscii(boolean escapeNonAscii)289 private Printer setEscapeNonAscii(boolean escapeNonAscii) { 290 this.escapeNonAscii = escapeNonAscii; 291 return this; 292 } 293 print( final MessageOrBuilder message, final TextGenerator generator)294 private void print( 295 final MessageOrBuilder message, final TextGenerator generator) 296 throws IOException { 297 for (Map.Entry<FieldDescriptor, Object> field 298 : message.getAllFields().entrySet()) { 299 printField(field.getKey(), field.getValue(), generator); 300 } 301 printUnknownFields(message.getUnknownFields(), generator); 302 } 303 printField(final FieldDescriptor field, final Object value, final TextGenerator generator)304 private void printField(final FieldDescriptor field, final Object value, 305 final TextGenerator generator) throws IOException { 306 if (field.isRepeated()) { 307 // Repeated field. Print each element. 308 for (Object element : (List<?>) value) { 309 printSingleField(field, element, generator); 310 } 311 } else { 312 printSingleField(field, value, generator); 313 } 314 } 315 printSingleField(final FieldDescriptor field, final Object value, final TextGenerator generator)316 private void printSingleField(final FieldDescriptor field, 317 final Object value, 318 final TextGenerator generator) 319 throws IOException { 320 if (field.isExtension()) { 321 generator.print("["); 322 // We special-case MessageSet elements for compatibility with proto1. 323 if (field.getContainingType().getOptions().getMessageSetWireFormat() 324 && (field.getType() == FieldDescriptor.Type.MESSAGE) 325 && (field.isOptional()) 326 // object equality 327 && (field.getExtensionScope() == field.getMessageType())) { 328 generator.print(field.getMessageType().getFullName()); 329 } else { 330 generator.print(field.getFullName()); 331 } 332 generator.print("]"); 333 } else { 334 if (field.getType() == FieldDescriptor.Type.GROUP) { 335 // Groups must be serialized with their original capitalization. 336 generator.print(field.getMessageType().getName()); 337 } else { 338 generator.print(field.getName()); 339 } 340 } 341 342 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 343 if (singleLineMode) { 344 generator.print(" { "); 345 } else { 346 generator.print(" {\n"); 347 generator.indent(); 348 } 349 } else { 350 generator.print(": "); 351 } 352 353 printFieldValue(field, value, generator); 354 355 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 356 if (singleLineMode) { 357 generator.print("} "); 358 } else { 359 generator.outdent(); 360 generator.print("}\n"); 361 } 362 } else { 363 if (singleLineMode) { 364 generator.print(" "); 365 } else { 366 generator.print("\n"); 367 } 368 } 369 } 370 printFieldValue(final FieldDescriptor field, final Object value, final TextGenerator generator)371 private void printFieldValue(final FieldDescriptor field, 372 final Object value, 373 final TextGenerator generator) 374 throws IOException { 375 switch (field.getType()) { 376 case INT32: 377 case SINT32: 378 case SFIXED32: 379 generator.print(((Integer) value).toString()); 380 break; 381 382 case INT64: 383 case SINT64: 384 case SFIXED64: 385 generator.print(((Long) value).toString()); 386 break; 387 388 case BOOL: 389 generator.print(((Boolean) value).toString()); 390 break; 391 392 case FLOAT: 393 generator.print(((Float) value).toString()); 394 break; 395 396 case DOUBLE: 397 generator.print(((Double) value).toString()); 398 break; 399 400 case UINT32: 401 case FIXED32: 402 generator.print(unsignedToString((Integer) value)); 403 break; 404 405 case UINT64: 406 case FIXED64: 407 generator.print(unsignedToString((Long) value)); 408 break; 409 410 case STRING: 411 generator.print("\""); 412 generator.print(escapeNonAscii ? 413 escapeText((String) value) : 414 escapeDoubleQuotesAndBackslashes((String) value)); 415 generator.print("\""); 416 break; 417 418 case BYTES: 419 generator.print("\""); 420 if (value instanceof ByteString) { 421 generator.print(escapeBytes((ByteString) value)); 422 } else { 423 generator.print(escapeBytes((byte[]) value)); 424 } 425 generator.print("\""); 426 break; 427 428 case ENUM: 429 generator.print(((EnumValueDescriptor) value).getName()); 430 break; 431 432 case MESSAGE: 433 case GROUP: 434 print((Message) value, generator); 435 break; 436 } 437 } 438 printUnknownFields(final UnknownFieldSet unknownFields, final TextGenerator generator)439 private void printUnknownFields(final UnknownFieldSet unknownFields, 440 final TextGenerator generator) 441 throws IOException { 442 for (Map.Entry<Integer, UnknownFieldSet.Field> entry : 443 unknownFields.asMap().entrySet()) { 444 final int number = entry.getKey(); 445 final UnknownFieldSet.Field field = entry.getValue(); 446 printUnknownField(number, WireFormat.WIRETYPE_VARINT, 447 field.getVarintList(), generator); 448 printUnknownField(number, WireFormat.WIRETYPE_FIXED32, 449 field.getFixed32List(), generator); 450 printUnknownField(number, WireFormat.WIRETYPE_FIXED64, 451 field.getFixed64List(), generator); 452 printUnknownField(number, WireFormat.WIRETYPE_LENGTH_DELIMITED, 453 field.getLengthDelimitedList(), generator); 454 for (final UnknownFieldSet value : field.getGroupList()) { 455 generator.print(entry.getKey().toString()); 456 if (singleLineMode) { 457 generator.print(" { "); 458 } else { 459 generator.print(" {\n"); 460 generator.indent(); 461 } 462 printUnknownFields(value, generator); 463 if (singleLineMode) { 464 generator.print("} "); 465 } else { 466 generator.outdent(); 467 generator.print("}\n"); 468 } 469 } 470 } 471 } 472 printUnknownField(final int number, final int wireType, final List<?> values, final TextGenerator generator)473 private void printUnknownField(final int number, 474 final int wireType, 475 final List<?> values, 476 final TextGenerator generator) 477 throws IOException { 478 for (final Object value : values) { 479 generator.print(String.valueOf(number)); 480 generator.print(": "); 481 printUnknownFieldValue(wireType, value, generator); 482 generator.print(singleLineMode ? " " : "\n"); 483 } 484 } 485 } 486 487 /** Convert an unsigned 32-bit integer to a string. */ unsignedToString(final int value)488 public static String unsignedToString(final int value) { 489 if (value >= 0) { 490 return Integer.toString(value); 491 } else { 492 return Long.toString(value & 0x00000000FFFFFFFFL); 493 } 494 } 495 496 /** Convert an unsigned 64-bit integer to a string. */ unsignedToString(final long value)497 public static String unsignedToString(final long value) { 498 if (value >= 0) { 499 return Long.toString(value); 500 } else { 501 // Pull off the most-significant bit so that BigInteger doesn't think 502 // the number is negative, then set it again using setBit(). 503 return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL) 504 .setBit(63).toString(); 505 } 506 } 507 508 /** 509 * An inner class for writing text to the output stream. 510 */ 511 private static final class TextGenerator { 512 private final Appendable output; 513 private final StringBuilder indent = new StringBuilder(); 514 private boolean atStartOfLine = true; 515 TextGenerator(final Appendable output)516 private TextGenerator(final Appendable output) { 517 this.output = output; 518 } 519 520 /** 521 * Indent text by two spaces. After calling Indent(), two spaces will be 522 * inserted at the beginning of each line of text. Indent() may be called 523 * multiple times to produce deeper indents. 524 */ indent()525 public void indent() { 526 indent.append(" "); 527 } 528 529 /** 530 * Reduces the current indent level by two spaces, or crashes if the indent 531 * level is zero. 532 */ outdent()533 public void outdent() { 534 final int length = indent.length(); 535 if (length == 0) { 536 throw new IllegalArgumentException( 537 " Outdent() without matching Indent()."); 538 } 539 indent.delete(length - 2, length); 540 } 541 542 /** 543 * Print text to the output stream. 544 */ print(final CharSequence text)545 public void print(final CharSequence text) throws IOException { 546 final int size = text.length(); 547 int pos = 0; 548 549 for (int i = 0; i < size; i++) { 550 if (text.charAt(i) == '\n') { 551 write(text.subSequence(pos, i + 1)); 552 pos = i + 1; 553 atStartOfLine = true; 554 } 555 } 556 write(text.subSequence(pos, size)); 557 } 558 write(final CharSequence data)559 private void write(final CharSequence data) throws IOException { 560 if (data.length() == 0) { 561 return; 562 } 563 if (atStartOfLine) { 564 atStartOfLine = false; 565 output.append(indent); 566 } 567 output.append(data); 568 } 569 } 570 571 // ================================================================= 572 // Parsing 573 574 /** 575 * Represents a stream of tokens parsed from a {@code String}. 576 * 577 * <p>The Java standard library provides many classes that you might think 578 * would be useful for implementing this, but aren't. For example: 579 * 580 * <ul> 581 * <li>{@code java.io.StreamTokenizer}: This almost does what we want -- or, 582 * at least, something that would get us close to what we want -- except 583 * for one fatal flaw: It automatically un-escapes strings using Java 584 * escape sequences, which do not include all the escape sequences we 585 * need to support (e.g. '\x'). 586 * <li>{@code java.util.Scanner}: This seems like a great way at least to 587 * parse regular expressions out of a stream (so we wouldn't have to load 588 * the entire input into a single string before parsing). Sadly, 589 * {@code Scanner} requires that tokens be delimited with some delimiter. 590 * Thus, although the text "foo:" should parse to two tokens ("foo" and 591 * ":"), {@code Scanner} would recognize it only as a single token. 592 * Furthermore, {@code Scanner} provides no way to inspect the contents 593 * of delimiters, making it impossible to keep track of line and column 594 * numbers. 595 * </ul> 596 * 597 * <p>Luckily, Java's regular expression support does manage to be useful to 598 * us. (Barely: We need {@code Matcher.usePattern()}, which is new in 599 * Java 1.5.) So, we can use that, at least. Unfortunately, this implies 600 * that we need to have the entire input in one contiguous string. 601 */ 602 private static final class Tokenizer { 603 private final CharSequence text; 604 private final Matcher matcher; 605 private String currentToken; 606 607 // The character index within this.text at which the current token begins. 608 private int pos = 0; 609 610 // The line and column numbers of the current token. 611 private int line = 0; 612 private int column = 0; 613 614 // The line and column numbers of the previous token (allows throwing 615 // errors *after* consuming). 616 private int previousLine = 0; 617 private int previousColumn = 0; 618 619 // We use possessive quantifiers (*+ and ++) because otherwise the Java 620 // regex matcher has stack overflows on large inputs. 621 private static final Pattern WHITESPACE = 622 Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE); 623 private static final Pattern TOKEN = Pattern.compile( 624 "[a-zA-Z_][0-9a-zA-Z_+-]*+|" + // an identifier 625 "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" + // a number 626 "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" + // a double-quoted string 627 "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string 628 Pattern.MULTILINE); 629 630 private static final Pattern DOUBLE_INFINITY = Pattern.compile( 631 "-?inf(inity)?", 632 Pattern.CASE_INSENSITIVE); 633 private static final Pattern FLOAT_INFINITY = Pattern.compile( 634 "-?inf(inity)?f?", 635 Pattern.CASE_INSENSITIVE); 636 private static final Pattern FLOAT_NAN = Pattern.compile( 637 "nanf?", 638 Pattern.CASE_INSENSITIVE); 639 640 /** Construct a tokenizer that parses tokens from the given text. */ Tokenizer(final CharSequence text)641 private Tokenizer(final CharSequence text) { 642 this.text = text; 643 this.matcher = WHITESPACE.matcher(text); 644 skipWhitespace(); 645 nextToken(); 646 } 647 648 /** Are we at the end of the input? */ atEnd()649 public boolean atEnd() { 650 return currentToken.length() == 0; 651 } 652 653 /** Advance to the next token. */ nextToken()654 public void nextToken() { 655 previousLine = line; 656 previousColumn = column; 657 658 // Advance the line counter to the current position. 659 while (pos < matcher.regionStart()) { 660 if (text.charAt(pos) == '\n') { 661 ++line; 662 column = 0; 663 } else { 664 ++column; 665 } 666 ++pos; 667 } 668 669 // Match the next token. 670 if (matcher.regionStart() == matcher.regionEnd()) { 671 // EOF 672 currentToken = ""; 673 } else { 674 matcher.usePattern(TOKEN); 675 if (matcher.lookingAt()) { 676 currentToken = matcher.group(); 677 matcher.region(matcher.end(), matcher.regionEnd()); 678 } else { 679 // Take one character. 680 currentToken = String.valueOf(text.charAt(pos)); 681 matcher.region(pos + 1, matcher.regionEnd()); 682 } 683 684 skipWhitespace(); 685 } 686 } 687 688 /** 689 * Skip over any whitespace so that the matcher region starts at the next 690 * token. 691 */ skipWhitespace()692 private void skipWhitespace() { 693 matcher.usePattern(WHITESPACE); 694 if (matcher.lookingAt()) { 695 matcher.region(matcher.end(), matcher.regionEnd()); 696 } 697 } 698 699 /** 700 * If the next token exactly matches {@code token}, consume it and return 701 * {@code true}. Otherwise, return {@code false} without doing anything. 702 */ tryConsume(final String token)703 public boolean tryConsume(final String token) { 704 if (currentToken.equals(token)) { 705 nextToken(); 706 return true; 707 } else { 708 return false; 709 } 710 } 711 712 /** 713 * If the next token exactly matches {@code token}, consume it. Otherwise, 714 * throw a {@link ParseException}. 715 */ consume(final String token)716 public void consume(final String token) throws ParseException { 717 if (!tryConsume(token)) { 718 throw parseException("Expected \"" + token + "\"."); 719 } 720 } 721 722 /** 723 * Returns {@code true} if the next token is an integer, but does 724 * not consume it. 725 */ lookingAtInteger()726 public boolean lookingAtInteger() { 727 if (currentToken.length() == 0) { 728 return false; 729 } 730 731 final char c = currentToken.charAt(0); 732 return ('0' <= c && c <= '9') || 733 c == '-' || c == '+'; 734 } 735 736 /** 737 * Returns {@code true} if the current token's text is equal to that 738 * specified. 739 */ lookingAt(String text)740 public boolean lookingAt(String text) { 741 return currentToken.equals(text); 742 } 743 744 /** 745 * If the next token is an identifier, consume it and return its value. 746 * Otherwise, throw a {@link ParseException}. 747 */ consumeIdentifier()748 public String consumeIdentifier() throws ParseException { 749 for (int i = 0; i < currentToken.length(); i++) { 750 final char c = currentToken.charAt(i); 751 if (('a' <= c && c <= 'z') || 752 ('A' <= c && c <= 'Z') || 753 ('0' <= c && c <= '9') || 754 (c == '_') || (c == '.')) { 755 // OK 756 } else { 757 throw parseException( 758 "Expected identifier. Found '" + currentToken + "'"); 759 } 760 } 761 762 final String result = currentToken; 763 nextToken(); 764 return result; 765 } 766 767 /** 768 * If the next token is an identifier, consume it and return {@code true}. 769 * Otherwise, return {@code false} without doing anything. 770 */ tryConsumeIdentifier()771 public boolean tryConsumeIdentifier() { 772 try { 773 consumeIdentifier(); 774 return true; 775 } catch (ParseException e) { 776 return false; 777 } 778 } 779 780 /** 781 * If the next token is a 32-bit signed integer, consume it and return its 782 * value. Otherwise, throw a {@link ParseException}. 783 */ consumeInt32()784 public int consumeInt32() throws ParseException { 785 try { 786 final int result = parseInt32(currentToken); 787 nextToken(); 788 return result; 789 } catch (NumberFormatException e) { 790 throw integerParseException(e); 791 } 792 } 793 794 /** 795 * If the next token is a 32-bit unsigned integer, consume it and return its 796 * value. Otherwise, throw a {@link ParseException}. 797 */ consumeUInt32()798 public int consumeUInt32() throws ParseException { 799 try { 800 final int result = parseUInt32(currentToken); 801 nextToken(); 802 return result; 803 } catch (NumberFormatException e) { 804 throw integerParseException(e); 805 } 806 } 807 808 /** 809 * If the next token is a 64-bit signed integer, consume it and return its 810 * value. Otherwise, throw a {@link ParseException}. 811 */ consumeInt64()812 public long consumeInt64() throws ParseException { 813 try { 814 final long result = parseInt64(currentToken); 815 nextToken(); 816 return result; 817 } catch (NumberFormatException e) { 818 throw integerParseException(e); 819 } 820 } 821 822 /** 823 * If the next token is a 64-bit signed integer, consume it and return 824 * {@code true}. Otherwise, return {@code false} without doing anything. 825 */ tryConsumeInt64()826 public boolean tryConsumeInt64() { 827 try { 828 consumeInt64(); 829 return true; 830 } catch (ParseException e) { 831 return false; 832 } 833 } 834 835 /** 836 * If the next token is a 64-bit unsigned integer, consume it and return its 837 * value. Otherwise, throw a {@link ParseException}. 838 */ consumeUInt64()839 public long consumeUInt64() throws ParseException { 840 try { 841 final long result = parseUInt64(currentToken); 842 nextToken(); 843 return result; 844 } catch (NumberFormatException e) { 845 throw integerParseException(e); 846 } 847 } 848 849 /** 850 * If the next token is a 64-bit unsigned integer, consume it and return 851 * {@code true}. Otherwise, return {@code false} without doing anything. 852 */ tryConsumeUInt64()853 public boolean tryConsumeUInt64() { 854 try { 855 consumeUInt64(); 856 return true; 857 } catch (ParseException e) { 858 return false; 859 } 860 } 861 862 /** 863 * If the next token is a double, consume it and return its value. 864 * Otherwise, throw a {@link ParseException}. 865 */ consumeDouble()866 public double consumeDouble() throws ParseException { 867 // We need to parse infinity and nan separately because 868 // Double.parseDouble() does not accept "inf", "infinity", or "nan". 869 if (DOUBLE_INFINITY.matcher(currentToken).matches()) { 870 final boolean negative = currentToken.startsWith("-"); 871 nextToken(); 872 return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; 873 } 874 if (currentToken.equalsIgnoreCase("nan")) { 875 nextToken(); 876 return Double.NaN; 877 } 878 try { 879 final double result = Double.parseDouble(currentToken); 880 nextToken(); 881 return result; 882 } catch (NumberFormatException e) { 883 throw floatParseException(e); 884 } 885 } 886 887 /** 888 * If the next token is a double, consume it and return {@code true}. 889 * Otherwise, return {@code false} without doing anything. 890 */ tryConsumeDouble()891 public boolean tryConsumeDouble() { 892 try { 893 consumeDouble(); 894 return true; 895 } catch (ParseException e) { 896 return false; 897 } 898 } 899 900 /** 901 * If the next token is a float, consume it and return its value. 902 * Otherwise, throw a {@link ParseException}. 903 */ consumeFloat()904 public float consumeFloat() throws ParseException { 905 // We need to parse infinity and nan separately because 906 // Float.parseFloat() does not accept "inf", "infinity", or "nan". 907 if (FLOAT_INFINITY.matcher(currentToken).matches()) { 908 final boolean negative = currentToken.startsWith("-"); 909 nextToken(); 910 return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY; 911 } 912 if (FLOAT_NAN.matcher(currentToken).matches()) { 913 nextToken(); 914 return Float.NaN; 915 } 916 try { 917 final float result = Float.parseFloat(currentToken); 918 nextToken(); 919 return result; 920 } catch (NumberFormatException e) { 921 throw floatParseException(e); 922 } 923 } 924 925 /** 926 * If the next token is a float, consume it and return {@code true}. 927 * Otherwise, return {@code false} without doing anything. 928 */ tryConsumeFloat()929 public boolean tryConsumeFloat() { 930 try { 931 consumeFloat(); 932 return true; 933 } catch (ParseException e) { 934 return false; 935 } 936 } 937 938 /** 939 * If the next token is a boolean, consume it and return its value. 940 * Otherwise, throw a {@link ParseException}. 941 */ consumeBoolean()942 public boolean consumeBoolean() throws ParseException { 943 if (currentToken.equals("true") || 944 currentToken.equals("t") || 945 currentToken.equals("1")) { 946 nextToken(); 947 return true; 948 } else if (currentToken.equals("false") || 949 currentToken.equals("f") || 950 currentToken.equals("0")) { 951 nextToken(); 952 return false; 953 } else { 954 throw parseException("Expected \"true\" or \"false\"."); 955 } 956 } 957 958 /** 959 * If the next token is a string, consume it and return its (unescaped) 960 * value. Otherwise, throw a {@link ParseException}. 961 */ consumeString()962 public String consumeString() throws ParseException { 963 return consumeByteString().toStringUtf8(); 964 } 965 966 /** 967 * If the next token is a string, consume it and return true. Otherwise, 968 * return false. 969 */ tryConsumeString()970 public boolean tryConsumeString() { 971 try { 972 consumeString(); 973 return true; 974 } catch (ParseException e) { 975 return false; 976 } 977 } 978 979 /** 980 * If the next token is a string, consume it, unescape it as a 981 * {@link ByteString}, and return it. Otherwise, throw a 982 * {@link ParseException}. 983 */ consumeByteString()984 public ByteString consumeByteString() throws ParseException { 985 List<ByteString> list = new ArrayList<ByteString>(); 986 consumeByteString(list); 987 while (currentToken.startsWith("'") || currentToken.startsWith("\"")) { 988 consumeByteString(list); 989 } 990 return ByteString.copyFrom(list); 991 } 992 993 /** 994 * Like {@link #consumeByteString()} but adds each token of the string to 995 * the given list. String literals (whether bytes or text) may come in 996 * multiple adjacent tokens which are automatically concatenated, like in 997 * C or Python. 998 */ consumeByteString(List<ByteString> list)999 private void consumeByteString(List<ByteString> list) 1000 throws ParseException { 1001 final char quote = currentToken.length() > 0 ? currentToken.charAt(0) 1002 : '\0'; 1003 if (quote != '\"' && quote != '\'') { 1004 throw parseException("Expected string."); 1005 } 1006 1007 if (currentToken.length() < 2 || 1008 currentToken.charAt(currentToken.length() - 1) != quote) { 1009 throw parseException("String missing ending quote."); 1010 } 1011 1012 try { 1013 final String escaped = 1014 currentToken.substring(1, currentToken.length() - 1); 1015 final ByteString result = unescapeBytes(escaped); 1016 nextToken(); 1017 list.add(result); 1018 } catch (InvalidEscapeSequenceException e) { 1019 throw parseException(e.getMessage()); 1020 } 1021 } 1022 1023 /** 1024 * Returns a {@link ParseException} with the current line and column 1025 * numbers in the description, suitable for throwing. 1026 */ parseException(final String description)1027 public ParseException parseException(final String description) { 1028 // Note: People generally prefer one-based line and column numbers. 1029 return new ParseException( 1030 line + 1, column + 1, description); 1031 } 1032 1033 /** 1034 * Returns a {@link ParseException} with the line and column numbers of 1035 * the previous token in the description, suitable for throwing. 1036 */ parseExceptionPreviousToken( final String description)1037 public ParseException parseExceptionPreviousToken( 1038 final String description) { 1039 // Note: People generally prefer one-based line and column numbers. 1040 return new ParseException( 1041 previousLine + 1, previousColumn + 1, description); 1042 } 1043 1044 /** 1045 * Constructs an appropriate {@link ParseException} for the given 1046 * {@code NumberFormatException} when trying to parse an integer. 1047 */ integerParseException( final NumberFormatException e)1048 private ParseException integerParseException( 1049 final NumberFormatException e) { 1050 return parseException("Couldn't parse integer: " + e.getMessage()); 1051 } 1052 1053 /** 1054 * Constructs an appropriate {@link ParseException} for the given 1055 * {@code NumberFormatException} when trying to parse a float or double. 1056 */ floatParseException(final NumberFormatException e)1057 private ParseException floatParseException(final NumberFormatException e) { 1058 return parseException("Couldn't parse number: " + e.getMessage()); 1059 } 1060 } 1061 1062 /** Thrown when parsing an invalid text format message. */ 1063 public static class ParseException extends IOException { 1064 private static final long serialVersionUID = 3196188060225107702L; 1065 1066 private final int line; 1067 private final int column; 1068 1069 /** Create a new instance, with -1 as the line and column numbers. */ ParseException(final String message)1070 public ParseException(final String message) { 1071 this(-1, -1, message); 1072 } 1073 1074 /** 1075 * Create a new instance 1076 * 1077 * @param line the line number where the parse error occurred, 1078 * using 1-offset. 1079 * @param column the column number where the parser error occurred, 1080 * using 1-offset. 1081 */ ParseException(final int line, final int column, final String message)1082 public ParseException(final int line, final int column, 1083 final String message) { 1084 super(Integer.toString(line) + ":" + column + ": " + message); 1085 this.line = line; 1086 this.column = column; 1087 } 1088 1089 /** 1090 * Return the line where the parse exception occurred, or -1 when 1091 * none is provided. The value is specified as 1-offset, so the first 1092 * line is line 1. 1093 */ getLine()1094 public int getLine() { 1095 return line; 1096 } 1097 1098 /** 1099 * Return the column where the parse exception occurred, or -1 when 1100 * none is provided. The value is specified as 1-offset, so the first 1101 * line is line 1. 1102 */ getColumn()1103 public int getColumn() { 1104 return column; 1105 } 1106 } 1107 1108 private static final Parser PARSER = Parser.newBuilder().build(); 1109 1110 /** 1111 * Return a {@link Parser} instance which can parse text-format 1112 * messages. The returned instance is thread-safe. 1113 */ getParser()1114 public static Parser getParser() { 1115 return PARSER; 1116 } 1117 1118 /** 1119 * Parse a text-format message from {@code input} and merge the contents 1120 * into {@code builder}. 1121 */ merge(final Readable input, final Message.Builder builder)1122 public static void merge(final Readable input, 1123 final Message.Builder builder) 1124 throws IOException { 1125 PARSER.merge(input, builder); 1126 } 1127 1128 /** 1129 * Parse a text-format message from {@code input} and merge the contents 1130 * into {@code builder}. 1131 */ merge(final CharSequence input, final Message.Builder builder)1132 public static void merge(final CharSequence input, 1133 final Message.Builder builder) 1134 throws ParseException { 1135 PARSER.merge(input, builder); 1136 } 1137 1138 /** 1139 * Parse a text-format message from {@code input} and merge the contents 1140 * into {@code builder}. Extensions will be recognized if they are 1141 * registered in {@code extensionRegistry}. 1142 */ merge(final Readable input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1143 public static void merge(final Readable input, 1144 final ExtensionRegistry extensionRegistry, 1145 final Message.Builder builder) 1146 throws IOException { 1147 PARSER.merge(input, extensionRegistry, builder); 1148 } 1149 1150 1151 /** 1152 * Parse a text-format message from {@code input} and merge the contents 1153 * into {@code builder}. Extensions will be recognized if they are 1154 * registered in {@code extensionRegistry}. 1155 */ merge(final CharSequence input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1156 public static void merge(final CharSequence input, 1157 final ExtensionRegistry extensionRegistry, 1158 final Message.Builder builder) 1159 throws ParseException { 1160 PARSER.merge(input, extensionRegistry, builder); 1161 } 1162 1163 1164 /** 1165 * Parser for text-format proto2 instances. This class is thread-safe. 1166 * The implementation largely follows google/protobuf/text_format.cc. 1167 * 1168 * <p>Use {@link TextFormat#getParser()} to obtain the default parser, or 1169 * {@link Builder} to control the parser behavior. 1170 */ 1171 public static class Parser { 1172 /** 1173 * Determines if repeated values for non-repeated fields and 1174 * oneofs are permitted. For example, given required/optional field "foo" 1175 * and a oneof containing "baz" and "qux": 1176 * <li> 1177 * <ul>"foo: 1 foo: 2" 1178 * <ul>"baz: 1 qux: 2" 1179 * <ul>merging "foo: 2" into a proto in which foo is already set, or 1180 * <ul>merging "qux: 2" into a proto in which baz is already set. 1181 * </li> 1182 */ 1183 public enum SingularOverwritePolicy { 1184 /** The last value is retained. */ 1185 ALLOW_SINGULAR_OVERWRITES, 1186 /** An error is issued. */ 1187 FORBID_SINGULAR_OVERWRITES 1188 } 1189 1190 private final boolean allowUnknownFields; 1191 private final SingularOverwritePolicy singularOverwritePolicy; 1192 Parser(boolean allowUnknownFields, SingularOverwritePolicy singularOverwritePolicy)1193 private Parser(boolean allowUnknownFields, 1194 SingularOverwritePolicy singularOverwritePolicy) { 1195 this.allowUnknownFields = allowUnknownFields; 1196 this.singularOverwritePolicy = singularOverwritePolicy; 1197 } 1198 1199 /** 1200 * Returns a new instance of {@link Builder}. 1201 */ newBuilder()1202 public static Builder newBuilder() { 1203 return new Builder(); 1204 } 1205 1206 /** 1207 * Builder that can be used to obtain new instances of {@link Parser}. 1208 */ 1209 public static class Builder { 1210 private boolean allowUnknownFields = false; 1211 private SingularOverwritePolicy singularOverwritePolicy = 1212 SingularOverwritePolicy.ALLOW_SINGULAR_OVERWRITES; 1213 1214 /** 1215 * Sets parser behavior when a non-repeated field appears more than once. 1216 */ setSingularOverwritePolicy(SingularOverwritePolicy p)1217 public Builder setSingularOverwritePolicy(SingularOverwritePolicy p) { 1218 this.singularOverwritePolicy = p; 1219 return this; 1220 } 1221 build()1222 public Parser build() { 1223 return new Parser(allowUnknownFields, singularOverwritePolicy); 1224 } 1225 } 1226 1227 /** 1228 * Parse a text-format message from {@code input} and merge the contents 1229 * into {@code builder}. 1230 */ merge(final Readable input, final Message.Builder builder)1231 public void merge(final Readable input, 1232 final Message.Builder builder) 1233 throws IOException { 1234 merge(input, ExtensionRegistry.getEmptyRegistry(), builder); 1235 } 1236 1237 /** 1238 * Parse a text-format message from {@code input} and merge the contents 1239 * into {@code builder}. 1240 */ merge(final CharSequence input, final Message.Builder builder)1241 public void merge(final CharSequence input, 1242 final Message.Builder builder) 1243 throws ParseException { 1244 merge(input, ExtensionRegistry.getEmptyRegistry(), builder); 1245 } 1246 1247 /** 1248 * Parse a text-format message from {@code input} and merge the contents 1249 * into {@code builder}. Extensions will be recognized if they are 1250 * registered in {@code extensionRegistry}. 1251 */ merge(final Readable input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1252 public void merge(final Readable input, 1253 final ExtensionRegistry extensionRegistry, 1254 final Message.Builder builder) 1255 throws IOException { 1256 // Read the entire input to a String then parse that. 1257 1258 // If StreamTokenizer were not quite so crippled, or if there were a kind 1259 // of Reader that could read in chunks that match some particular regex, 1260 // or if we wanted to write a custom Reader to tokenize our stream, then 1261 // we would not have to read to one big String. Alas, none of these is 1262 // the case. Oh well. 1263 1264 merge(toStringBuilder(input), extensionRegistry, builder); 1265 } 1266 1267 1268 private static final int BUFFER_SIZE = 4096; 1269 1270 // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer) 1271 // overhead is worthwhile toStringBuilder(final Readable input)1272 private static StringBuilder toStringBuilder(final Readable input) 1273 throws IOException { 1274 final StringBuilder text = new StringBuilder(); 1275 final CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE); 1276 while (true) { 1277 final int n = input.read(buffer); 1278 if (n == -1) { 1279 break; 1280 } 1281 buffer.flip(); 1282 text.append(buffer, 0, n); 1283 } 1284 return text; 1285 } 1286 1287 /** 1288 * Parse a text-format message from {@code input} and merge the contents 1289 * into {@code builder}. Extensions will be recognized if they are 1290 * registered in {@code extensionRegistry}. 1291 */ merge(final CharSequence input, final ExtensionRegistry extensionRegistry, final Message.Builder builder)1292 public void merge(final CharSequence input, 1293 final ExtensionRegistry extensionRegistry, 1294 final Message.Builder builder) 1295 throws ParseException { 1296 final Tokenizer tokenizer = new Tokenizer(input); 1297 MessageReflection.BuilderAdapter target = 1298 new MessageReflection.BuilderAdapter(builder); 1299 1300 while (!tokenizer.atEnd()) { 1301 mergeField(tokenizer, extensionRegistry, target); 1302 } 1303 } 1304 1305 1306 /** 1307 * Parse a single field from {@code tokenizer} and merge it into 1308 * {@code builder}. 1309 */ mergeField(final Tokenizer tokenizer, final ExtensionRegistry extensionRegistry, final MessageReflection.MergeTarget target)1310 private void mergeField(final Tokenizer tokenizer, 1311 final ExtensionRegistry extensionRegistry, 1312 final MessageReflection.MergeTarget target) 1313 throws ParseException { 1314 FieldDescriptor field = null; 1315 final Descriptor type = target.getDescriptorForType(); 1316 ExtensionRegistry.ExtensionInfo extension = null; 1317 1318 if (tokenizer.tryConsume("[")) { 1319 // An extension. 1320 final StringBuilder name = 1321 new StringBuilder(tokenizer.consumeIdentifier()); 1322 while (tokenizer.tryConsume(".")) { 1323 name.append('.'); 1324 name.append(tokenizer.consumeIdentifier()); 1325 } 1326 1327 extension = target.findExtensionByName( 1328 extensionRegistry, name.toString()); 1329 1330 if (extension == null) { 1331 if (!allowUnknownFields) { 1332 throw tokenizer.parseExceptionPreviousToken( 1333 "Extension \"" + name + "\" not found in the ExtensionRegistry."); 1334 } else { 1335 logger.warning( 1336 "Extension \"" + name + "\" not found in the ExtensionRegistry."); 1337 } 1338 } else { 1339 if (extension.descriptor.getContainingType() != type) { 1340 throw tokenizer.parseExceptionPreviousToken( 1341 "Extension \"" + name + "\" does not extend message type \"" + 1342 type.getFullName() + "\"."); 1343 } 1344 field = extension.descriptor; 1345 } 1346 1347 tokenizer.consume("]"); 1348 } else { 1349 final String name = tokenizer.consumeIdentifier(); 1350 field = type.findFieldByName(name); 1351 1352 // Group names are expected to be capitalized as they appear in the 1353 // .proto file, which actually matches their type names, not their field 1354 // names. 1355 if (field == null) { 1356 // Explicitly specify US locale so that this code does not break when 1357 // executing in Turkey. 1358 final String lowerName = name.toLowerCase(Locale.US); 1359 field = type.findFieldByName(lowerName); 1360 // If the case-insensitive match worked but the field is NOT a group, 1361 if (field != null && field.getType() != FieldDescriptor.Type.GROUP) { 1362 field = null; 1363 } 1364 } 1365 // Again, special-case group names as described above. 1366 if (field != null && field.getType() == FieldDescriptor.Type.GROUP && 1367 !field.getMessageType().getName().equals(name)) { 1368 field = null; 1369 } 1370 1371 if (field == null) { 1372 if (!allowUnknownFields) { 1373 throw tokenizer.parseExceptionPreviousToken( 1374 "Message type \"" + type.getFullName() + 1375 "\" has no field named \"" + name + "\"."); 1376 } else { 1377 logger.warning( 1378 "Message type \"" + type.getFullName() + 1379 "\" has no field named \"" + name + "\"."); 1380 } 1381 } 1382 } 1383 1384 // Skips unknown fields. 1385 if (field == null) { 1386 // Try to guess the type of this field. 1387 // If this field is not a message, there should be a ":" between the 1388 // field name and the field value and also the field value should not 1389 // start with "{" or "<" which indicates the begining of a message body. 1390 // If there is no ":" or there is a "{" or "<" after ":", this field has 1391 // to be a message or the input is ill-formed. 1392 if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("{") && 1393 !tokenizer.lookingAt("<")) { 1394 skipFieldValue(tokenizer); 1395 } else { 1396 skipFieldMessage(tokenizer); 1397 } 1398 return; 1399 } 1400 1401 // Handle potential ':'. 1402 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 1403 tokenizer.tryConsume(":"); // optional 1404 } else { 1405 tokenizer.consume(":"); // required 1406 } 1407 // Support specifying repeated field values as a comma-separated list. 1408 // Ex."foo: [1, 2, 3]" 1409 if (field.isRepeated() && tokenizer.tryConsume("[")) { 1410 while (true) { 1411 consumeFieldValue(tokenizer, extensionRegistry, target, field, extension); 1412 if (tokenizer.tryConsume("]")) { 1413 // End of list. 1414 break; 1415 } 1416 tokenizer.consume(","); 1417 } 1418 } else { 1419 consumeFieldValue(tokenizer, extensionRegistry, target, field, extension); 1420 } 1421 } 1422 1423 /** 1424 * Parse a single field value from {@code tokenizer} and merge it into 1425 * {@code builder}. 1426 */ consumeFieldValue( final Tokenizer tokenizer, final ExtensionRegistry extensionRegistry, final MessageReflection.MergeTarget target, final FieldDescriptor field, final ExtensionRegistry.ExtensionInfo extension)1427 private void consumeFieldValue( 1428 final Tokenizer tokenizer, 1429 final ExtensionRegistry extensionRegistry, 1430 final MessageReflection.MergeTarget target, 1431 final FieldDescriptor field, 1432 final ExtensionRegistry.ExtensionInfo extension) 1433 throws ParseException { 1434 Object value = null; 1435 1436 if (field.getJavaType() == FieldDescriptor.JavaType.MESSAGE) { 1437 final String endToken; 1438 if (tokenizer.tryConsume("<")) { 1439 endToken = ">"; 1440 } else { 1441 tokenizer.consume("{"); 1442 endToken = "}"; 1443 } 1444 1445 final MessageReflection.MergeTarget subField; 1446 subField = target.newMergeTargetForField(field, 1447 (extension == null) ? null : extension.defaultInstance); 1448 1449 while (!tokenizer.tryConsume(endToken)) { 1450 if (tokenizer.atEnd()) { 1451 throw tokenizer.parseException( 1452 "Expected \"" + endToken + "\"."); 1453 } 1454 mergeField(tokenizer, extensionRegistry, subField); 1455 } 1456 1457 value = subField.finish(); 1458 1459 } else { 1460 switch (field.getType()) { 1461 case INT32: 1462 case SINT32: 1463 case SFIXED32: 1464 value = tokenizer.consumeInt32(); 1465 break; 1466 1467 case INT64: 1468 case SINT64: 1469 case SFIXED64: 1470 value = tokenizer.consumeInt64(); 1471 break; 1472 1473 case UINT32: 1474 case FIXED32: 1475 value = tokenizer.consumeUInt32(); 1476 break; 1477 1478 case UINT64: 1479 case FIXED64: 1480 value = tokenizer.consumeUInt64(); 1481 break; 1482 1483 case FLOAT: 1484 value = tokenizer.consumeFloat(); 1485 break; 1486 1487 case DOUBLE: 1488 value = tokenizer.consumeDouble(); 1489 break; 1490 1491 case BOOL: 1492 value = tokenizer.consumeBoolean(); 1493 break; 1494 1495 case STRING: 1496 value = tokenizer.consumeString(); 1497 break; 1498 1499 case BYTES: 1500 value = tokenizer.consumeByteString(); 1501 break; 1502 1503 case ENUM: 1504 final EnumDescriptor enumType = field.getEnumType(); 1505 1506 if (tokenizer.lookingAtInteger()) { 1507 final int number = tokenizer.consumeInt32(); 1508 value = enumType.findValueByNumber(number); 1509 if (value == null) { 1510 throw tokenizer.parseExceptionPreviousToken( 1511 "Enum type \"" + enumType.getFullName() + 1512 "\" has no value with number " + number + '.'); 1513 } 1514 } else { 1515 final String id = tokenizer.consumeIdentifier(); 1516 value = enumType.findValueByName(id); 1517 if (value == null) { 1518 throw tokenizer.parseExceptionPreviousToken( 1519 "Enum type \"" + enumType.getFullName() + 1520 "\" has no value named \"" + id + "\"."); 1521 } 1522 } 1523 1524 break; 1525 1526 case MESSAGE: 1527 case GROUP: 1528 throw new RuntimeException("Can't get here."); 1529 } 1530 } 1531 1532 if (field.isRepeated()) { 1533 target.addRepeatedField(field, value); 1534 } else if ((singularOverwritePolicy 1535 == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES) 1536 && target.hasField(field)) { 1537 throw tokenizer.parseExceptionPreviousToken("Non-repeated field \"" 1538 + field.getFullName() + "\" cannot be overwritten."); 1539 } else if ((singularOverwritePolicy 1540 == SingularOverwritePolicy.FORBID_SINGULAR_OVERWRITES) 1541 && field.getContainingOneof() != null 1542 && target.hasOneof(field.getContainingOneof())) { 1543 Descriptors.OneofDescriptor oneof = field.getContainingOneof(); 1544 throw tokenizer.parseExceptionPreviousToken("Field \"" 1545 + field.getFullName() + "\" is specified along with field \"" 1546 + target.getOneofFieldDescriptor(oneof).getFullName() 1547 + "\", another member of oneof \"" + oneof.getName() + "\"."); 1548 } else { 1549 target.setField(field, value); 1550 } 1551 } 1552 1553 /** 1554 * Skips the next field including the field's name and value. 1555 */ skipField(Tokenizer tokenizer)1556 private void skipField(Tokenizer tokenizer) throws ParseException { 1557 if (tokenizer.tryConsume("[")) { 1558 // Extension name. 1559 do { 1560 tokenizer.consumeIdentifier(); 1561 } while (tokenizer.tryConsume(".")); 1562 tokenizer.consume("]"); 1563 } else { 1564 tokenizer.consumeIdentifier(); 1565 } 1566 1567 // Try to guess the type of this field. 1568 // If this field is not a message, there should be a ":" between the 1569 // field name and the field value and also the field value should not 1570 // start with "{" or "<" which indicates the begining of a message body. 1571 // If there is no ":" or there is a "{" or "<" after ":", this field has 1572 // to be a message or the input is ill-formed. 1573 if (tokenizer.tryConsume(":") && !tokenizer.lookingAt("<") && 1574 !tokenizer.lookingAt("{")) { 1575 skipFieldValue(tokenizer); 1576 } else { 1577 skipFieldMessage(tokenizer); 1578 } 1579 // For historical reasons, fields may optionally be separated by commas or 1580 // semicolons. 1581 if (!tokenizer.tryConsume(";")) { 1582 tokenizer.tryConsume(","); 1583 } 1584 } 1585 1586 /** 1587 * Skips the whole body of a message including the beginning delimeter and 1588 * the ending delimeter. 1589 */ skipFieldMessage(Tokenizer tokenizer)1590 private void skipFieldMessage(Tokenizer tokenizer) throws ParseException { 1591 final String delimiter; 1592 if (tokenizer.tryConsume("<")) { 1593 delimiter = ">"; 1594 } else { 1595 tokenizer.consume("{"); 1596 delimiter = "}"; 1597 } 1598 while (!tokenizer.lookingAt(">") && !tokenizer.lookingAt("}")) { 1599 skipField(tokenizer); 1600 } 1601 tokenizer.consume(delimiter); 1602 } 1603 1604 /** 1605 * Skips a field value. 1606 */ skipFieldValue(Tokenizer tokenizer)1607 private void skipFieldValue(Tokenizer tokenizer) throws ParseException { 1608 if (tokenizer.tryConsumeString()) { 1609 while (tokenizer.tryConsumeString()) {} 1610 return; 1611 } 1612 if (!tokenizer.tryConsumeIdentifier() && // includes enum & boolean 1613 !tokenizer.tryConsumeInt64() && // includes int32 1614 !tokenizer.tryConsumeUInt64() && // includes uint32 1615 !tokenizer.tryConsumeDouble() && 1616 !tokenizer.tryConsumeFloat()) { 1617 throw tokenizer.parseException( 1618 "Invalid field value: " + tokenizer.currentToken); 1619 } 1620 } 1621 } 1622 1623 // ================================================================= 1624 // Utility functions 1625 // 1626 // Some of these methods are package-private because Descriptors.java uses 1627 // them. 1628 1629 private interface ByteSequence { size()1630 int size(); byteAt(int offset)1631 byte byteAt(int offset); 1632 } 1633 1634 /** 1635 * Escapes bytes in the format used in protocol buffer text format, which 1636 * is the same as the format used for C string literals. All bytes 1637 * that are not printable 7-bit ASCII characters are escaped, as well as 1638 * backslash, single-quote, and double-quote characters. Characters for 1639 * which no defined short-hand escape sequence is defined will be escaped 1640 * using 3-digit octal sequences. 1641 */ escapeBytes(final ByteSequence input)1642 private static String escapeBytes(final ByteSequence input) { 1643 final StringBuilder builder = new StringBuilder(input.size()); 1644 for (int i = 0; i < input.size(); i++) { 1645 final byte b = input.byteAt(i); 1646 switch (b) { 1647 // Java does not recognize \a or \v, apparently. 1648 case 0x07: builder.append("\\a" ); break; 1649 case '\b': builder.append("\\b" ); break; 1650 case '\f': builder.append("\\f" ); break; 1651 case '\n': builder.append("\\n" ); break; 1652 case '\r': builder.append("\\r" ); break; 1653 case '\t': builder.append("\\t" ); break; 1654 case 0x0b: builder.append("\\v" ); break; 1655 case '\\': builder.append("\\\\"); break; 1656 case '\'': builder.append("\\\'"); break; 1657 case '"' : builder.append("\\\""); break; 1658 default: 1659 // Note: Bytes with the high-order bit set should be escaped. Since 1660 // bytes are signed, such bytes will compare less than 0x20, hence 1661 // the following line is correct. 1662 if (b >= 0x20) { 1663 builder.append((char) b); 1664 } else { 1665 builder.append('\\'); 1666 builder.append((char) ('0' + ((b >>> 6) & 3))); 1667 builder.append((char) ('0' + ((b >>> 3) & 7))); 1668 builder.append((char) ('0' + (b & 7))); 1669 } 1670 break; 1671 } 1672 } 1673 return builder.toString(); 1674 } 1675 1676 /** 1677 * Escapes bytes in the format used in protocol buffer text format, which 1678 * is the same as the format used for C string literals. All bytes 1679 * that are not printable 7-bit ASCII characters are escaped, as well as 1680 * backslash, single-quote, and double-quote characters. Characters for 1681 * which no defined short-hand escape sequence is defined will be escaped 1682 * using 3-digit octal sequences. 1683 */ escapeBytes(final ByteString input)1684 static String escapeBytes(final ByteString input) { 1685 return escapeBytes(new ByteSequence() { 1686 public int size() { 1687 return input.size(); 1688 } 1689 public byte byteAt(int offset) { 1690 return input.byteAt(offset); 1691 } 1692 }); 1693 } 1694 1695 /** 1696 * Like {@link #escapeBytes(ByteString)}, but used for byte array. 1697 */ 1698 static String escapeBytes(final byte[] input) { 1699 return escapeBytes(new ByteSequence() { 1700 public int size() { 1701 return input.length; 1702 } 1703 public byte byteAt(int offset) { 1704 return input[offset]; 1705 } 1706 }); 1707 } 1708 1709 /** 1710 * Un-escape a byte sequence as escaped using 1711 * {@link #escapeBytes(ByteString)}. Two-digit hex escapes (starting with 1712 * "\x") are also recognized. 1713 */ 1714 static ByteString unescapeBytes(final CharSequence charString) 1715 throws InvalidEscapeSequenceException { 1716 // First convert the Java character sequence to UTF-8 bytes. 1717 ByteString input = ByteString.copyFromUtf8(charString.toString()); 1718 // Then unescape certain byte sequences introduced by ASCII '\\'. The valid 1719 // escapes can all be expressed with ASCII characters, so it is safe to 1720 // operate on bytes here. 1721 // 1722 // Unescaping the input byte array will result in a byte sequence that's no 1723 // longer than the input. That's because each escape sequence is between 1724 // two and four bytes long and stands for a single byte. 1725 final byte[] result = new byte[input.size()]; 1726 int pos = 0; 1727 for (int i = 0; i < input.size(); i++) { 1728 byte c = input.byteAt(i); 1729 if (c == '\\') { 1730 if (i + 1 < input.size()) { 1731 ++i; 1732 c = input.byteAt(i); 1733 if (isOctal(c)) { 1734 // Octal escape. 1735 int code = digitValue(c); 1736 if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) { 1737 ++i; 1738 code = code * 8 + digitValue(input.byteAt(i)); 1739 } 1740 if (i + 1 < input.size() && isOctal(input.byteAt(i + 1))) { 1741 ++i; 1742 code = code * 8 + digitValue(input.byteAt(i)); 1743 } 1744 // TODO: Check that 0 <= code && code <= 0xFF. 1745 result[pos++] = (byte)code; 1746 } else { 1747 switch (c) { 1748 case 'a' : result[pos++] = 0x07; break; 1749 case 'b' : result[pos++] = '\b'; break; 1750 case 'f' : result[pos++] = '\f'; break; 1751 case 'n' : result[pos++] = '\n'; break; 1752 case 'r' : result[pos++] = '\r'; break; 1753 case 't' : result[pos++] = '\t'; break; 1754 case 'v' : result[pos++] = 0x0b; break; 1755 case '\\': result[pos++] = '\\'; break; 1756 case '\'': result[pos++] = '\''; break; 1757 case '"' : result[pos++] = '\"'; break; 1758 1759 case 'x': 1760 // hex escape 1761 int code = 0; 1762 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) { 1763 ++i; 1764 code = digitValue(input.byteAt(i)); 1765 } else { 1766 throw new InvalidEscapeSequenceException( 1767 "Invalid escape sequence: '\\x' with no digits"); 1768 } 1769 if (i + 1 < input.size() && isHex(input.byteAt(i + 1))) { 1770 ++i; 1771 code = code * 16 + digitValue(input.byteAt(i)); 1772 } 1773 result[pos++] = (byte)code; 1774 break; 1775 1776 default: 1777 throw new InvalidEscapeSequenceException( 1778 "Invalid escape sequence: '\\" + (char)c + '\''); 1779 } 1780 } 1781 } else { 1782 throw new InvalidEscapeSequenceException( 1783 "Invalid escape sequence: '\\' at end of string."); 1784 } 1785 } else { 1786 result[pos++] = c; 1787 } 1788 } 1789 1790 return ByteString.copyFrom(result, 0, pos); 1791 } 1792 1793 /** 1794 * Thrown by {@link TextFormat#unescapeBytes} and 1795 * {@link TextFormat#unescapeText} when an invalid escape sequence is seen. 1796 */ 1797 static class InvalidEscapeSequenceException extends IOException { 1798 private static final long serialVersionUID = -8164033650142593304L; 1799 1800 InvalidEscapeSequenceException(final String description) { 1801 super(description); 1802 } 1803 } 1804 1805 /** 1806 * Like {@link #escapeBytes(ByteString)}, but escapes a text string. 1807 * Non-ASCII characters are first encoded as UTF-8, then each byte is escaped 1808 * individually as a 3-digit octal escape. Yes, it's weird. 1809 */ 1810 static String escapeText(final String input) { 1811 return escapeBytes(ByteString.copyFromUtf8(input)); 1812 } 1813 1814 /** 1815 * Escape double quotes and backslashes in a String for unicode output of a message. 1816 */ 1817 public static String escapeDoubleQuotesAndBackslashes(final String input) { 1818 return input.replace("\\", "\\\\").replace("\"", "\\\""); 1819 } 1820 1821 /** 1822 * Un-escape a text string as escaped using {@link #escapeText(String)}. 1823 * Two-digit hex escapes (starting with "\x") are also recognized. 1824 */ 1825 static String unescapeText(final String input) 1826 throws InvalidEscapeSequenceException { 1827 return unescapeBytes(input).toStringUtf8(); 1828 } 1829 1830 /** Is this an octal digit? */ 1831 private static boolean isOctal(final byte c) { 1832 return '0' <= c && c <= '7'; 1833 } 1834 1835 /** Is this a hex digit? */ 1836 private static boolean isHex(final byte c) { 1837 return ('0' <= c && c <= '9') || 1838 ('a' <= c && c <= 'f') || 1839 ('A' <= c && c <= 'F'); 1840 } 1841 1842 /** 1843 * Interpret a character as a digit (in any base up to 36) and return the 1844 * numeric value. This is like {@code Character.digit()} but we don't accept 1845 * non-ASCII digits. 1846 */ 1847 private static int digitValue(final byte c) { 1848 if ('0' <= c && c <= '9') { 1849 return c - '0'; 1850 } else if ('a' <= c && c <= 'z') { 1851 return c - 'a' + 10; 1852 } else { 1853 return c - 'A' + 10; 1854 } 1855 } 1856 1857 /** 1858 * Parse a 32-bit signed integer from the text. Unlike the Java standard 1859 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1860 * and "0" to signify hexadecimal and octal numbers, respectively. 1861 */ 1862 static int parseInt32(final String text) throws NumberFormatException { 1863 return (int) parseInteger(text, true, false); 1864 } 1865 1866 /** 1867 * Parse a 32-bit unsigned integer from the text. Unlike the Java standard 1868 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1869 * and "0" to signify hexadecimal and octal numbers, respectively. The 1870 * result is coerced to a (signed) {@code int} when returned since Java has 1871 * no unsigned integer type. 1872 */ 1873 static int parseUInt32(final String text) throws NumberFormatException { 1874 return (int) parseInteger(text, false, false); 1875 } 1876 1877 /** 1878 * Parse a 64-bit signed integer from the text. Unlike the Java standard 1879 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1880 * and "0" to signify hexadecimal and octal numbers, respectively. 1881 */ 1882 static long parseInt64(final String text) throws NumberFormatException { 1883 return parseInteger(text, true, true); 1884 } 1885 1886 /** 1887 * Parse a 64-bit unsigned integer from the text. Unlike the Java standard 1888 * {@code Integer.parseInt()}, this function recognizes the prefixes "0x" 1889 * and "0" to signify hexadecimal and octal numbers, respectively. The 1890 * result is coerced to a (signed) {@code long} when returned since Java has 1891 * no unsigned long type. 1892 */ 1893 static long parseUInt64(final String text) throws NumberFormatException { 1894 return parseInteger(text, false, true); 1895 } 1896 1897 private static long parseInteger(final String text, 1898 final boolean isSigned, 1899 final boolean isLong) 1900 throws NumberFormatException { 1901 int pos = 0; 1902 1903 boolean negative = false; 1904 if (text.startsWith("-", pos)) { 1905 if (!isSigned) { 1906 throw new NumberFormatException("Number must be positive: " + text); 1907 } 1908 ++pos; 1909 negative = true; 1910 } 1911 1912 int radix = 10; 1913 if (text.startsWith("0x", pos)) { 1914 pos += 2; 1915 radix = 16; 1916 } else if (text.startsWith("0", pos)) { 1917 radix = 8; 1918 } 1919 1920 final String numberText = text.substring(pos); 1921 1922 long result = 0; 1923 if (numberText.length() < 16) { 1924 // Can safely assume no overflow. 1925 result = Long.parseLong(numberText, radix); 1926 if (negative) { 1927 result = -result; 1928 } 1929 1930 // Check bounds. 1931 // No need to check for 64-bit numbers since they'd have to be 16 chars 1932 // or longer to overflow. 1933 if (!isLong) { 1934 if (isSigned) { 1935 if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) { 1936 throw new NumberFormatException( 1937 "Number out of range for 32-bit signed integer: " + text); 1938 } 1939 } else { 1940 if (result >= (1L << 32) || result < 0) { 1941 throw new NumberFormatException( 1942 "Number out of range for 32-bit unsigned integer: " + text); 1943 } 1944 } 1945 } 1946 } else { 1947 BigInteger bigValue = new BigInteger(numberText, radix); 1948 if (negative) { 1949 bigValue = bigValue.negate(); 1950 } 1951 1952 // Check bounds. 1953 if (!isLong) { 1954 if (isSigned) { 1955 if (bigValue.bitLength() > 31) { 1956 throw new NumberFormatException( 1957 "Number out of range for 32-bit signed integer: " + text); 1958 } 1959 } else { 1960 if (bigValue.bitLength() > 32) { 1961 throw new NumberFormatException( 1962 "Number out of range for 32-bit unsigned integer: " + text); 1963 } 1964 } 1965 } else { 1966 if (isSigned) { 1967 if (bigValue.bitLength() > 63) { 1968 throw new NumberFormatException( 1969 "Number out of range for 64-bit signed integer: " + text); 1970 } 1971 } else { 1972 if (bigValue.bitLength() > 64) { 1973 throw new NumberFormatException( 1974 "Number out of range for 64-bit unsigned integer: " + text); 1975 } 1976 } 1977 } 1978 1979 result = bigValue.longValue(); 1980 } 1981 1982 return result; 1983 } 1984 } 1985