1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.protobuf; 32 33 import static com.google.protobuf.UnsafeUtil.addressOffset; 34 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; 35 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; 36 import static java.lang.Character.MAX_SURROGATE; 37 import static java.lang.Character.MIN_HIGH_SURROGATE; 38 import static java.lang.Character.MIN_LOW_SURROGATE; 39 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 40 import static java.lang.Character.MIN_SURROGATE; 41 import static java.lang.Character.isSurrogatePair; 42 import static java.lang.Character.toCodePoint; 43 44 import java.nio.ByteBuffer; 45 46 /** 47 * A set of low-level, high-performance static utility methods related to the UTF-8 character 48 * encoding. This class has no dependencies outside of the core JDK libraries. 49 * 50 * <p>There are several variants of UTF-8. The one implemented by this class is the restricted 51 * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte 52 * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8 53 * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but 54 * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences. 55 * 56 * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip 57 * converted to Strings and back to bytes using the UTF-8 charset, without loss: 58 * 59 * <pre>{@code 60 * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) 61 * }</pre> 62 * 63 * <p>See the Unicode Standard,</br> Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> Table 3-7. 64 * <em>Well Formed UTF-8 Byte Sequences</em>. 65 * 66 * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8 67 * byte sequences can be stored in multiple segments. Methods typically return {@link #MALFORMED} if 68 * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed 69 * in the absence of additional input, or if the byte sequence apparently terminated in the middle 70 * of a character, an opaque integer "state" value containing enough information to decode the 71 * character when passed to a subsequent invocation of a partial decoding method. 72 * 73 * @author martinrb@google.com (Martin Buchholz) 74 */ 75 // TODO(nathanmittler): Copy changes in this class back to Guava 76 final class Utf8 { 77 78 /** 79 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 80 * depending on what is available on the platform. The processor is the platform-optimized 81 * delegate for which all methods are delegated directly to. 82 */ 83 private static final Processor processor = 84 (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice()) 85 ? new UnsafeProcessor() 86 : new SafeProcessor(); 87 88 /** 89 * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII 90 * characters (i.e. any byte >= 0x80). 91 */ 92 private static final long ASCII_MASK_LONG = 0x8080808080808080L; 93 94 /** 95 * Maximum number of bytes per Java UTF-16 char in UTF-8. 96 * 97 * @see java.nio.charset.CharsetEncoder#maxBytesPerChar() 98 */ 99 static final int MAX_BYTES_PER_CHAR = 3; 100 101 /** 102 * State value indicating that the byte sequence is well-formed and complete (no further bytes are 103 * needed to complete a character). 104 */ 105 public static final int COMPLETE = 0; 106 107 /** State value indicating that the byte sequence is definitely not well-formed. */ 108 public static final int MALFORMED = -1; 109 110 /** 111 * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length 112 * above which to employ an optimized algorithm for counting ASCII characters. The reason for this 113 * threshold is that for small strings, the optimization may not be beneficial or may even 114 * negatively impact performance since it requires additional logic to avoid unaligned reads (when 115 * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is 116 * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which 117 * provides a performance improvement that entirely subsumes the cost of the additional logic. 118 */ 119 private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16; 120 121 // Other state values include the partial bytes of the incomplete 122 // character to be decoded in the simplest way: we pack the bytes 123 // into the state int in little-endian order. For example: 124 // 125 // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16); 126 // 127 // Such a state is unpacked thus (note the ~ operation for byte2 to 128 // undo byte1's sign-extension bits): 129 // 130 // int byte1 = (byte) state; 131 // int byte2 = (byte) ~(state >> 8); 132 // int byte3 = (byte) (state >> 16); 133 // 134 // We cannot store a zero byte in the state because it would be 135 // indistinguishable from the absence of a byte. But we don't need 136 // to, because partial bytes must always be negative. When building 137 // a state, we ensure that byte1 is negative and subsequent bytes 138 // are valid trailing bytes. 139 140 /** 141 * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence. 142 * 143 * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0, 144 * bytes.length)}. 145 */ isValidUtf8(byte[] bytes)146 public static boolean isValidUtf8(byte[] bytes) { 147 return processor.isValidUtf8(bytes, 0, bytes.length); 148 } 149 150 /** 151 * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The 152 * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, 153 * exclusive. 154 * 155 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 156 * == Utf8.COMPLETE}. 157 */ isValidUtf8(byte[] bytes, int index, int limit)158 public static boolean isValidUtf8(byte[] bytes, int index, int limit) { 159 return processor.isValidUtf8(bytes, index, limit); 160 } 161 162 /** 163 * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte 164 * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to 165 * {@code limit}, exclusive. 166 * 167 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the 168 * value returned from a call to a partial decoding method for the previous bytes 169 * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link 170 * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is 171 * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer 172 * "state" value containing enough information to decode the character when passed to a 173 * subsequent invocation of a partial decoding method. 174 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)175 public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 176 return processor.partialIsValidUtf8(state, bytes, index, limit); 177 } 178 incompleteStateFor(int byte1)179 private static int incompleteStateFor(int byte1) { 180 return (byte1 > (byte) 0xF4) ? MALFORMED : byte1; 181 } 182 incompleteStateFor(int byte1, int byte2)183 private static int incompleteStateFor(int byte1, int byte2) { 184 return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8); 185 } 186 incompleteStateFor(int byte1, int byte2, int byte3)187 private static int incompleteStateFor(int byte1, int byte2, int byte3) { 188 return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF) 189 ? MALFORMED 190 : byte1 ^ (byte2 << 8) ^ (byte3 << 16); 191 } 192 incompleteStateFor(byte[] bytes, int index, int limit)193 private static int incompleteStateFor(byte[] bytes, int index, int limit) { 194 int byte1 = bytes[index - 1]; 195 switch (limit - index) { 196 case 0: 197 return incompleteStateFor(byte1); 198 case 1: 199 return incompleteStateFor(byte1, bytes[index]); 200 case 2: 201 return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); 202 default: 203 throw new AssertionError(); 204 } 205 } 206 incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)207 private static int incompleteStateFor( 208 final ByteBuffer buffer, final int byte1, final int index, final int remaining) { 209 switch (remaining) { 210 case 0: 211 return incompleteStateFor(byte1); 212 case 1: 213 return incompleteStateFor(byte1, buffer.get(index)); 214 case 2: 215 return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1)); 216 default: 217 throw new AssertionError(); 218 } 219 } 220 221 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 222 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 223 // fallback to more lenient behavior. 224 225 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)226 UnpairedSurrogateException(int index, int length) { 227 super("Unpaired surrogate at index " + index + " of " + length); 228 } 229 } 230 231 /** 232 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this 233 * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both 234 * time and space. 235 * 236 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 237 * surrogates) 238 */ encodedLength(CharSequence sequence)239 static int encodedLength(CharSequence sequence) { 240 // Warning to maintainers: this implementation is highly optimized. 241 int utf16Length = sequence.length(); 242 int utf8Length = utf16Length; 243 int i = 0; 244 245 // This loop optimizes for pure ASCII. 246 while (i < utf16Length && sequence.charAt(i) < 0x80) { 247 i++; 248 } 249 250 // This loop optimizes for chars less than 0x800. 251 for (; i < utf16Length; i++) { 252 char c = sequence.charAt(i); 253 if (c < 0x800) { 254 utf8Length += ((0x7f - c) >>> 31); // branch free! 255 } else { 256 utf8Length += encodedLengthGeneral(sequence, i); 257 break; 258 } 259 } 260 261 if (utf8Length < utf16Length) { 262 // Necessary and sufficient condition for overflow because of maximum 3x expansion 263 throw new IllegalArgumentException( 264 "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); 265 } 266 return utf8Length; 267 } 268 encodedLengthGeneral(CharSequence sequence, int start)269 private static int encodedLengthGeneral(CharSequence sequence, int start) { 270 int utf16Length = sequence.length(); 271 int utf8Length = 0; 272 for (int i = start; i < utf16Length; i++) { 273 char c = sequence.charAt(i); 274 if (c < 0x800) { 275 utf8Length += (0x7f - c) >>> 31; // branch free! 276 } else { 277 utf8Length += 2; 278 // jdk7+: if (Character.isSurrogate(c)) { 279 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 280 // Check that we have a well-formed surrogate pair. 281 int cp = Character.codePointAt(sequence, i); 282 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) { 283 throw new UnpairedSurrogateException(i, utf16Length); 284 } 285 i++; 286 } 287 } 288 } 289 return utf8Length; 290 } 291 encode(CharSequence in, byte[] out, int offset, int length)292 static int encode(CharSequence in, byte[] out, int offset, int length) { 293 return processor.encodeUtf8(in, out, offset, length); 294 } 295 // End Guava UTF-8 methods. 296 297 /** 298 * Determines if the given {@link ByteBuffer} is a valid UTF-8 string. 299 * 300 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 301 * and the capabilities of the platform. 302 * 303 * @param buffer the buffer to check. 304 * @see Utf8#isValidUtf8(byte[], int, int) 305 */ isValidUtf8(ByteBuffer buffer)306 static boolean isValidUtf8(ByteBuffer buffer) { 307 return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining()); 308 } 309 310 /** 311 * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string. 312 * 313 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 314 * and the capabilities of the platform. 315 * 316 * @param buffer the buffer to check. 317 * @see Utf8#partialIsValidUtf8(int, byte[], int, int) 318 */ partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)319 static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) { 320 return processor.partialIsValidUtf8(state, buffer, index, limit); 321 } 322 323 /** 324 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}. 325 * 326 * @throws InvalidProtocolBufferException if the input is not valid UTF-8. 327 */ decodeUtf8(ByteBuffer buffer, int index, int size)328 static String decodeUtf8(ByteBuffer buffer, int index, int size) 329 throws InvalidProtocolBufferException { 330 return processor.decodeUtf8(buffer, index, size); 331 } 332 333 /** 334 * Decodes the given UTF-8 encoded byte array slice into a {@link String}. 335 * 336 * @throws InvalidProtocolBufferException if the input is not valid UTF-8. 337 */ decodeUtf8(byte[] bytes, int index, int size)338 static String decodeUtf8(byte[] bytes, int index, int size) 339 throws InvalidProtocolBufferException { 340 return processor.decodeUtf8(bytes, index, size); 341 } 342 343 /** 344 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 345 * 346 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 347 * and the capabilities of the platform. 348 * 349 * @param in the source string to be encoded 350 * @param out the target buffer to receive the encoded string. 351 * @see Utf8#encode(CharSequence, byte[], int, int) 352 */ encodeUtf8(CharSequence in, ByteBuffer out)353 static void encodeUtf8(CharSequence in, ByteBuffer out) { 354 processor.encodeUtf8(in, out); 355 } 356 357 /** 358 * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte 359 * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte 360 * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). 361 * 362 * @param buffer the buffer to be scanned for ASCII chars 363 * @param index the starting index of the scan 364 * @param limit the limit within buffer for the scan 365 * @return the number of ASCII characters found. The stopping position will be at or before the 366 * first non-ASCII byte. 367 */ estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)368 private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) { 369 int i = index; 370 final int lim = limit - 7; 371 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 372 // To speed things up further, we're reading longs instead of bytes so we use a mask to 373 // determine if any byte in the current long is non-ASCII. 374 for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {} 375 return i - index; 376 } 377 378 /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */ 379 // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android. 380 abstract static class Processor { 381 /** 382 * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The 383 * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, 384 * exclusive. 385 * 386 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 387 * == Utf8.COMPLETE}. 388 */ isValidUtf8(byte[] bytes, int index, int limit)389 final boolean isValidUtf8(byte[] bytes, int index, int limit) { 390 return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE; 391 } 392 393 /** 394 * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 395 * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, 396 * to {@code limit}, exclusive. 397 * 398 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the 399 * value returned from a call to a partial decoding method for the previous bytes 400 * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link 401 * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is 402 * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer 403 * "state" value containing enough information to decode the character when passed to a 404 * subsequent invocation of a partial decoding method. 405 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)406 abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit); 407 408 /** 409 * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8 410 * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, 411 * to {@code limit}, exclusive. 412 * 413 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 414 * == Utf8.COMPLETE}. 415 */ isValidUtf8(ByteBuffer buffer, int index, int limit)416 final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) { 417 return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE; 418 } 419 420 /** 421 * Indicates whether or not the given buffer contains a valid UTF-8 string. 422 * 423 * @param buffer the buffer to check. 424 * @return {@code true} if the given buffer contains a valid UTF-8 string. 425 */ partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)426 final int partialIsValidUtf8( 427 final int state, final ByteBuffer buffer, int index, final int limit) { 428 if (buffer.hasArray()) { 429 final int offset = buffer.arrayOffset(); 430 return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit); 431 } else if (buffer.isDirect()) { 432 return partialIsValidUtf8Direct(state, buffer, index, limit); 433 } 434 return partialIsValidUtf8Default(state, buffer, index, limit); 435 } 436 437 /** Performs validation for direct {@link ByteBuffer} instances. */ partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)438 abstract int partialIsValidUtf8Direct( 439 final int state, final ByteBuffer buffer, int index, final int limit); 440 441 /** 442 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 443 * than potentially faster approaches. This first completes validation for the current character 444 * (provided by {@code state}) and then finishes validation for the sequence. 445 */ partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)446 final int partialIsValidUtf8Default( 447 final int state, final ByteBuffer buffer, int index, final int limit) { 448 if (state != COMPLETE) { 449 // The previous decoding operation was incomplete (or malformed). 450 // We look for a well-formed sequence consisting of bytes from 451 // the previous decoding operation (stored in state) together 452 // with bytes from the array slice. 453 // 454 // We expect such "straddler characters" to be rare. 455 456 if (index >= limit) { // No bytes? No progress. 457 return state; 458 } 459 460 byte byte1 = (byte) state; 461 // byte1 is never ASCII. 462 if (byte1 < (byte) 0xE0) { 463 // two-byte form 464 465 // Simultaneously checks for illegal trailing-byte in 466 // leading position and overlong 2-byte form. 467 if (byte1 < (byte) 0xC2 468 // byte2 trailing-byte test 469 || buffer.get(index++) > (byte) 0xBF) { 470 return MALFORMED; 471 } 472 } else if (byte1 < (byte) 0xF0) { 473 // three-byte form 474 475 // Get byte2 from saved state or array 476 byte byte2 = (byte) ~(state >> 8); 477 if (byte2 == 0) { 478 byte2 = buffer.get(index++); 479 if (index >= limit) { 480 return incompleteStateFor(byte1, byte2); 481 } 482 } 483 if (byte2 > (byte) 0xBF 484 // overlong? 5 most significant bits must not all be zero 485 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 486 // illegal surrogate codepoint? 487 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 488 // byte3 trailing-byte test 489 || buffer.get(index++) > (byte) 0xBF) { 490 return MALFORMED; 491 } 492 } else { 493 // four-byte form 494 495 // Get byte2 and byte3 from saved state or array 496 byte byte2 = (byte) ~(state >> 8); 497 byte byte3 = 0; 498 if (byte2 == 0) { 499 byte2 = buffer.get(index++); 500 if (index >= limit) { 501 return incompleteStateFor(byte1, byte2); 502 } 503 } else { 504 byte3 = (byte) (state >> 16); 505 } 506 if (byte3 == 0) { 507 byte3 = buffer.get(index++); 508 if (index >= limit) { 509 return incompleteStateFor(byte1, byte2, byte3); 510 } 511 } 512 513 // If we were called with state == MALFORMED, then byte1 is 0xFF, 514 // which never occurs in well-formed UTF-8, and so we will return 515 // MALFORMED again below. 516 517 if (byte2 > (byte) 0xBF 518 // Check that 1 <= plane <= 16. Tricky optimized form of: 519 // if (byte1 > (byte) 0xF4 || 520 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 521 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 522 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 523 // byte3 trailing-byte test 524 || byte3 > (byte) 0xBF 525 // byte4 trailing-byte test 526 || buffer.get(index++) > (byte) 0xBF) { 527 return MALFORMED; 528 } 529 } 530 } 531 532 // Finish validation for the sequence. 533 return partialIsValidUtf8(buffer, index, limit); 534 } 535 536 /** 537 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 538 * than potentially faster approaches. 539 */ partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)540 private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) { 541 index += estimateConsecutiveAscii(buffer, index, limit); 542 543 for (; ; ) { 544 // Optimize for interior runs of ASCII bytes. 545 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 546 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 547 int byte1; 548 do { 549 if (index >= limit) { 550 return COMPLETE; 551 } 552 } while ((byte1 = buffer.get(index++)) >= 0); 553 554 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 555 if (byte1 < (byte) 0xE0) { 556 // Two-byte form (110xxxxx 10xxxxxx) 557 if (index >= limit) { 558 // Incomplete sequence 559 return byte1; 560 } 561 562 // Simultaneously checks for illegal trailing-byte in 563 // leading position and overlong 2-byte form. 564 if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) { 565 return MALFORMED; 566 } 567 index++; 568 } else if (byte1 < (byte) 0xF0) { 569 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 570 if (index >= limit - 1) { 571 // Incomplete sequence 572 return incompleteStateFor(buffer, byte1, index, limit - index); 573 } 574 575 final byte byte2 = buffer.get(index++); 576 if (byte2 > (byte) 0xBF 577 // overlong? 5 most significant bits must not all be zero 578 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 579 // check for illegal surrogate codepoints 580 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 581 // byte3 trailing-byte test 582 || buffer.get(index) > (byte) 0xBF) { 583 return MALFORMED; 584 } 585 index++; 586 } else { 587 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 588 if (index >= limit - 2) { 589 // Incomplete sequence 590 return incompleteStateFor(buffer, byte1, index, limit - index); 591 } 592 593 // TODO(nathanmittler): Consider using getInt() to improve performance. 594 final int byte2 = buffer.get(index++); 595 if (byte2 > (byte) 0xBF 596 // Check that 1 <= plane <= 16. Tricky optimized form of: 597 // if (byte1 > (byte) 0xF4 || 598 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 599 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 600 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 601 // byte3 trailing-byte test 602 || buffer.get(index++) > (byte) 0xBF 603 // byte4 trailing-byte test 604 || buffer.get(index++) > (byte) 0xBF) { 605 return MALFORMED; 606 } 607 } 608 } 609 } 610 611 /** 612 * Decodes the given byte array slice into a {@link String}. 613 * 614 * @throws InvalidProtocolBufferException if the byte array slice is not valid UTF-8. 615 */ decodeUtf8(byte[] bytes, int index, int size)616 abstract String decodeUtf8(byte[] bytes, int index, int size) 617 throws InvalidProtocolBufferException; 618 619 /** 620 * Decodes the given portion of the {@link ByteBuffer} into a {@link String}. 621 * 622 * @throws InvalidProtocolBufferException if the portion of the buffer is not valid UTF-8. 623 */ decodeUtf8(ByteBuffer buffer, int index, int size)624 final String decodeUtf8(ByteBuffer buffer, int index, int size) 625 throws InvalidProtocolBufferException { 626 if (buffer.hasArray()) { 627 final int offset = buffer.arrayOffset(); 628 return decodeUtf8(buffer.array(), offset + index, size); 629 } else if (buffer.isDirect()) { 630 return decodeUtf8Direct(buffer, index, size); 631 } 632 return decodeUtf8Default(buffer, index, size); 633 } 634 635 /** Decodes direct {@link ByteBuffer} instances into {@link String}. */ decodeUtf8Direct(ByteBuffer buffer, int index, int size)636 abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 637 throws InvalidProtocolBufferException; 638 639 /** 640 * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially 641 * faster approaches. 642 */ decodeUtf8Default(ByteBuffer buffer, int index, int size)643 final String decodeUtf8Default(ByteBuffer buffer, int index, int size) 644 throws InvalidProtocolBufferException { 645 // Bitwise OR combines the sign bits so any negative value fails the check. 646 if ((index | size | buffer.limit() - index - size) < 0) { 647 throw new ArrayIndexOutOfBoundsException( 648 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size)); 649 } 650 651 int offset = index; 652 final int limit = offset + size; 653 654 // The longest possible resulting String is the same as the number of input bytes, when it is 655 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 656 char[] resultArr = new char[size]; 657 int resultPos = 0; 658 659 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 660 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 661 while (offset < limit) { 662 byte b = buffer.get(offset); 663 if (!DecodeUtil.isOneByte(b)) { 664 break; 665 } 666 offset++; 667 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 668 } 669 670 while (offset < limit) { 671 byte byte1 = buffer.get(offset++); 672 if (DecodeUtil.isOneByte(byte1)) { 673 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 674 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 675 // extra optimized loop to take care of these runs. 676 while (offset < limit) { 677 byte b = buffer.get(offset); 678 if (!DecodeUtil.isOneByte(b)) { 679 break; 680 } 681 offset++; 682 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 683 } 684 } else if (DecodeUtil.isTwoBytes(byte1)) { 685 if (offset >= limit) { 686 throw InvalidProtocolBufferException.invalidUtf8(); 687 } 688 DecodeUtil.handleTwoBytes( 689 byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++); 690 } else if (DecodeUtil.isThreeBytes(byte1)) { 691 if (offset >= limit - 1) { 692 throw InvalidProtocolBufferException.invalidUtf8(); 693 } 694 DecodeUtil.handleThreeBytes( 695 byte1, 696 /* byte2 */ buffer.get(offset++), 697 /* byte3 */ buffer.get(offset++), 698 resultArr, 699 resultPos++); 700 } else { 701 if (offset >= limit - 2) { 702 throw InvalidProtocolBufferException.invalidUtf8(); 703 } 704 DecodeUtil.handleFourBytes( 705 byte1, 706 /* byte2 */ buffer.get(offset++), 707 /* byte3 */ buffer.get(offset++), 708 /* byte4 */ buffer.get(offset++), 709 resultArr, 710 resultPos++); 711 // 4-byte case requires two chars. 712 resultPos++; 713 } 714 } 715 716 return new String(resultArr, 0, resultPos); 717 } 718 719 /** 720 * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}). 721 * For a string, this method is similar to 722 * 723 * <pre>{@code 724 * byte[] a = string.getBytes(UTF_8); 725 * System.arraycopy(a, 0, bytes, offset, a.length); 726 * return offset + a.length; 727 * }</pre> 728 * 729 * but is more efficient in both time and space. One key difference is that this method requires 730 * paired surrogates, and therefore does not support chunking. While {@code 731 * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character, 732 * this method throws {@link UnpairedSurrogateException}. 733 * 734 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 735 * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * 736 * sequence.length()}, which is the largest possible number of bytes that any input can be 737 * encoded to. 738 * 739 * @param in the input character sequence to be encoded 740 * @param out the target array 741 * @param offset the starting offset in {@code bytes} to start writing at 742 * @param length the length of the {@code bytes}, starting from {@code offset} 743 * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired 744 * surrogates) 745 * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than 746 * {@code bytes.length - offset} 747 * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)} 748 */ encodeUtf8(CharSequence in, byte[] out, int offset, int length)749 abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length); 750 751 /** 752 * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}). 753 * Upon returning from this method, the {@code out} position will point to the position after 754 * the last encoded byte. This method requires paired surrogates, and therefore does not support 755 * chunking. 756 * 757 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 758 * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * 759 * in.length()}, which is the largest possible number of bytes that any input can be encoded to. 760 * 761 * @param in the source character sequence to be encoded 762 * @param out the target buffer 763 * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired 764 * surrogates) 765 * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code 766 * out.remaining()} 767 */ encodeUtf8(CharSequence in, ByteBuffer out)768 final void encodeUtf8(CharSequence in, ByteBuffer out) { 769 if (out.hasArray()) { 770 final int offset = out.arrayOffset(); 771 int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); 772 out.position(endIndex - offset); 773 } else if (out.isDirect()) { 774 encodeUtf8Direct(in, out); 775 } else { 776 encodeUtf8Default(in, out); 777 } 778 } 779 780 /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */ encodeUtf8Direct(CharSequence in, ByteBuffer out)781 abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out); 782 783 /** 784 * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link 785 * ByteBuffer} API, rather than potentially faster approaches. 786 */ encodeUtf8Default(CharSequence in, ByteBuffer out)787 final void encodeUtf8Default(CharSequence in, ByteBuffer out) { 788 final int inLength = in.length(); 789 int outIx = out.position(); 790 int inIx = 0; 791 792 // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check 793 // access. Assume the buffer is big enough and let it handle the out of bounds exception 794 // if it occurs. 795 try { 796 // Designed to take advantage of 797 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 798 for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { 799 out.put(outIx + inIx, (byte) c); 800 } 801 if (inIx == inLength) { 802 // Successfully encoded the entire string. 803 out.position(outIx + inIx); 804 return; 805 } 806 807 outIx += inIx; 808 for (char c; inIx < inLength; ++inIx, ++outIx) { 809 c = in.charAt(inIx); 810 if (c < 0x80) { 811 // One byte (0xxx xxxx) 812 out.put(outIx, (byte) c); 813 } else if (c < 0x800) { 814 // Two bytes (110x xxxx 10xx xxxx) 815 816 // Benchmarks show put performs better than putShort here (for HotSpot). 817 out.put(outIx++, (byte) (0xC0 | (c >>> 6))); 818 out.put(outIx, (byte) (0x80 | (0x3F & c))); 819 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 820 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 821 // Maximum single-char code point is 0xFFFF, 16 bits. 822 823 // Benchmarks show put performs better than putShort here (for HotSpot). 824 out.put(outIx++, (byte) (0xE0 | (c >>> 12))); 825 out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 826 out.put(outIx, (byte) (0x80 | (0x3F & c))); 827 } else { 828 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 829 830 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 831 // bytes 832 final char low; 833 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 834 throw new UnpairedSurrogateException(inIx, inLength); 835 } 836 // TODO(nathanmittler): Consider using putInt() to improve performance. 837 int codePoint = toCodePoint(c, low); 838 out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 839 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 840 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 841 out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); 842 } 843 } 844 845 // Successfully encoded the entire string. 846 out.position(outIx); 847 } catch (IndexOutOfBoundsException e) { 848 // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead. 849 850 // If we failed in the outer ASCII loop, outIx will not have been updated. In this case, 851 // use inIx to determine the bad write index. 852 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1); 853 throw new ArrayIndexOutOfBoundsException( 854 "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex); 855 } 856 } 857 } 858 859 /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */ 860 static final class SafeProcessor extends Processor { 861 @Override partialIsValidUtf8(int state, byte[] bytes, int index, int limit)862 int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 863 if (state != COMPLETE) { 864 // The previous decoding operation was incomplete (or malformed). 865 // We look for a well-formed sequence consisting of bytes from 866 // the previous decoding operation (stored in state) together 867 // with bytes from the array slice. 868 // 869 // We expect such "straddler characters" to be rare. 870 871 if (index >= limit) { // No bytes? No progress. 872 return state; 873 } 874 int byte1 = (byte) state; 875 // byte1 is never ASCII. 876 if (byte1 < (byte) 0xE0) { 877 // two-byte form 878 879 // Simultaneously checks for illegal trailing-byte in 880 // leading position and overlong 2-byte form. 881 if (byte1 < (byte) 0xC2 882 // byte2 trailing-byte test 883 || bytes[index++] > (byte) 0xBF) { 884 return MALFORMED; 885 } 886 } else if (byte1 < (byte) 0xF0) { 887 // three-byte form 888 889 // Get byte2 from saved state or array 890 int byte2 = (byte) ~(state >> 8); 891 if (byte2 == 0) { 892 byte2 = bytes[index++]; 893 if (index >= limit) { 894 return incompleteStateFor(byte1, byte2); 895 } 896 } 897 if (byte2 > (byte) 0xBF 898 // overlong? 5 most significant bits must not all be zero 899 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 900 // illegal surrogate codepoint? 901 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 902 // byte3 trailing-byte test 903 || bytes[index++] > (byte) 0xBF) { 904 return MALFORMED; 905 } 906 } else { 907 // four-byte form 908 909 // Get byte2 and byte3 from saved state or array 910 int byte2 = (byte) ~(state >> 8); 911 int byte3 = 0; 912 if (byte2 == 0) { 913 byte2 = bytes[index++]; 914 if (index >= limit) { 915 return incompleteStateFor(byte1, byte2); 916 } 917 } else { 918 byte3 = (byte) (state >> 16); 919 } 920 if (byte3 == 0) { 921 byte3 = bytes[index++]; 922 if (index >= limit) { 923 return incompleteStateFor(byte1, byte2, byte3); 924 } 925 } 926 927 // If we were called with state == MALFORMED, then byte1 is 0xFF, 928 // which never occurs in well-formed UTF-8, and so we will return 929 // MALFORMED again below. 930 931 if (byte2 > (byte) 0xBF 932 // Check that 1 <= plane <= 16. Tricky optimized form of: 933 // if (byte1 > (byte) 0xF4 || 934 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 935 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 936 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 937 // byte3 trailing-byte test 938 || byte3 > (byte) 0xBF 939 // byte4 trailing-byte test 940 || bytes[index++] > (byte) 0xBF) { 941 return MALFORMED; 942 } 943 } 944 } 945 946 return partialIsValidUtf8(bytes, index, limit); 947 } 948 949 @Override partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)950 int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) { 951 // For safe processing, we have to use the ByteBuffer API. 952 return partialIsValidUtf8Default(state, buffer, index, limit); 953 } 954 955 @Override decodeUtf8(byte[] bytes, int index, int size)956 String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException { 957 // Bitwise OR combines the sign bits so any negative value fails the check. 958 if ((index | size | bytes.length - index - size) < 0) { 959 throw new ArrayIndexOutOfBoundsException( 960 String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); 961 } 962 963 int offset = index; 964 final int limit = offset + size; 965 966 // The longest possible resulting String is the same as the number of input bytes, when it is 967 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 968 char[] resultArr = new char[size]; 969 int resultPos = 0; 970 971 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 972 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 973 while (offset < limit) { 974 byte b = bytes[offset]; 975 if (!DecodeUtil.isOneByte(b)) { 976 break; 977 } 978 offset++; 979 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 980 } 981 982 while (offset < limit) { 983 byte byte1 = bytes[offset++]; 984 if (DecodeUtil.isOneByte(byte1)) { 985 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 986 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 987 // extra optimized loop to take care of these runs. 988 while (offset < limit) { 989 byte b = bytes[offset]; 990 if (!DecodeUtil.isOneByte(b)) { 991 break; 992 } 993 offset++; 994 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 995 } 996 } else if (DecodeUtil.isTwoBytes(byte1)) { 997 if (offset >= limit) { 998 throw InvalidProtocolBufferException.invalidUtf8(); 999 } 1000 DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++); 1001 } else if (DecodeUtil.isThreeBytes(byte1)) { 1002 if (offset >= limit - 1) { 1003 throw InvalidProtocolBufferException.invalidUtf8(); 1004 } 1005 DecodeUtil.handleThreeBytes( 1006 byte1, 1007 /* byte2 */ bytes[offset++], 1008 /* byte3 */ bytes[offset++], 1009 resultArr, 1010 resultPos++); 1011 } else { 1012 if (offset >= limit - 2) { 1013 throw InvalidProtocolBufferException.invalidUtf8(); 1014 } 1015 DecodeUtil.handleFourBytes( 1016 byte1, 1017 /* byte2 */ bytes[offset++], 1018 /* byte3 */ bytes[offset++], 1019 /* byte4 */ bytes[offset++], 1020 resultArr, 1021 resultPos++); 1022 // 4-byte case requires two chars. 1023 resultPos++; 1024 } 1025 } 1026 1027 return new String(resultArr, 0, resultPos); 1028 } 1029 1030 @Override decodeUtf8Direct(ByteBuffer buffer, int index, int size)1031 String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 1032 throws InvalidProtocolBufferException { 1033 // For safe processing, we have to use the ByteBufferAPI. 1034 return decodeUtf8Default(buffer, index, size); 1035 } 1036 1037 @Override encodeUtf8(CharSequence in, byte[] out, int offset, int length)1038 int encodeUtf8(CharSequence in, byte[] out, int offset, int length) { 1039 int utf16Length = in.length(); 1040 int j = offset; 1041 int i = 0; 1042 int limit = offset + length; 1043 // Designed to take advantage of 1044 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1045 for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) { 1046 out[j + i] = (byte) c; 1047 } 1048 if (i == utf16Length) { 1049 return j + utf16Length; 1050 } 1051 j += i; 1052 for (char c; i < utf16Length; i++) { 1053 c = in.charAt(i); 1054 if (c < 0x80 && j < limit) { 1055 out[j++] = (byte) c; 1056 } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes 1057 out[j++] = (byte) ((0xF << 6) | (c >>> 6)); 1058 out[j++] = (byte) (0x80 | (0x3F & c)); 1059 } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) { 1060 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1061 out[j++] = (byte) ((0xF << 5) | (c >>> 12)); 1062 out[j++] = (byte) (0x80 | (0x3F & (c >>> 6))); 1063 out[j++] = (byte) (0x80 | (0x3F & c)); 1064 } else if (j <= limit - 4) { 1065 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, 1066 // four UTF-8 bytes 1067 final char low; 1068 if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { 1069 throw new UnpairedSurrogateException((i - 1), utf16Length); 1070 } 1071 int codePoint = Character.toCodePoint(c, low); 1072 out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18)); 1073 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 1074 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 1075 out[j++] = (byte) (0x80 | (0x3F & codePoint)); 1076 } else { 1077 // If we are surrogates and we're not a surrogate pair, always throw an 1078 // UnpairedSurrogateException instead of an ArrayOutOfBoundsException. 1079 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) 1080 && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { 1081 throw new UnpairedSurrogateException(i, utf16Length); 1082 } 1083 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); 1084 } 1085 } 1086 return j; 1087 } 1088 1089 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)1090 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1091 // For safe processing, we have to use the ByteBuffer API. 1092 encodeUtf8Default(in, out); 1093 } 1094 partialIsValidUtf8(byte[] bytes, int index, int limit)1095 private static int partialIsValidUtf8(byte[] bytes, int index, int limit) { 1096 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 1097 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1098 while (index < limit && bytes[index] >= 0) { 1099 index++; 1100 } 1101 1102 return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit); 1103 } 1104 partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)1105 private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) { 1106 for (; ; ) { 1107 int byte1; 1108 int byte2; 1109 1110 // Optimize for interior runs of ASCII bytes. 1111 do { 1112 if (index >= limit) { 1113 return COMPLETE; 1114 } 1115 } while ((byte1 = bytes[index++]) >= 0); 1116 1117 if (byte1 < (byte) 0xE0) { 1118 // two-byte form 1119 1120 if (index >= limit) { 1121 // Incomplete sequence 1122 return byte1; 1123 } 1124 1125 // Simultaneously checks for illegal trailing-byte in 1126 // leading position and overlong 2-byte form. 1127 if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { 1128 return MALFORMED; 1129 } 1130 } else if (byte1 < (byte) 0xF0) { 1131 // three-byte form 1132 1133 if (index >= limit - 1) { // incomplete sequence 1134 return incompleteStateFor(bytes, index, limit); 1135 } 1136 if ((byte2 = bytes[index++]) > (byte) 0xBF 1137 // overlong? 5 most significant bits must not all be zero 1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1139 // check for illegal surrogate codepoints 1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1141 // byte3 trailing-byte test 1142 || bytes[index++] > (byte) 0xBF) { 1143 return MALFORMED; 1144 } 1145 } else { 1146 // four-byte form 1147 1148 if (index >= limit - 2) { // incomplete sequence 1149 return incompleteStateFor(bytes, index, limit); 1150 } 1151 if ((byte2 = bytes[index++]) > (byte) 0xBF 1152 // Check that 1 <= plane <= 16. Tricky optimized form of: 1153 // if (byte1 > (byte) 0xF4 || 1154 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1155 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1156 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1157 // byte3 trailing-byte test 1158 || bytes[index++] > (byte) 0xBF 1159 // byte4 trailing-byte test 1160 || bytes[index++] > (byte) 0xBF) { 1161 return MALFORMED; 1162 } 1163 } 1164 } 1165 } 1166 } 1167 1168 /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */ 1169 static final class UnsafeProcessor extends Processor { 1170 /** Indicates whether or not all required unsafe operations are supported on this platform. */ isAvailable()1171 static boolean isAvailable() { 1172 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); 1173 } 1174 1175 @Override partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)1176 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) { 1177 // Bitwise OR combines the sign bits so any negative value fails the check. 1178 if ((index | limit | bytes.length - limit) < 0) { 1179 throw new ArrayIndexOutOfBoundsException( 1180 String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit)); 1181 } 1182 long offset = index; 1183 final long offsetLimit = limit; 1184 if (state != COMPLETE) { 1185 // The previous decoding operation was incomplete (or malformed). 1186 // We look for a well-formed sequence consisting of bytes from 1187 // the previous decoding operation (stored in state) together 1188 // with bytes from the array slice. 1189 // 1190 // We expect such "straddler characters" to be rare. 1191 1192 if (offset >= offsetLimit) { // No bytes? No progress. 1193 return state; 1194 } 1195 int byte1 = (byte) state; 1196 // byte1 is never ASCII. 1197 if (byte1 < (byte) 0xE0) { 1198 // two-byte form 1199 1200 // Simultaneously checks for illegal trailing-byte in 1201 // leading position and overlong 2-byte form. 1202 if (byte1 < (byte) 0xC2 1203 // byte2 trailing-byte test 1204 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1205 return MALFORMED; 1206 } 1207 } else if (byte1 < (byte) 0xF0) { 1208 // three-byte form 1209 1210 // Get byte2 from saved state or array 1211 int byte2 = (byte) ~(state >> 8); 1212 if (byte2 == 0) { 1213 byte2 = UnsafeUtil.getByte(bytes, offset++); 1214 if (offset >= offsetLimit) { 1215 return incompleteStateFor(byte1, byte2); 1216 } 1217 } 1218 if (byte2 > (byte) 0xBF 1219 // overlong? 5 most significant bits must not all be zero 1220 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1221 // illegal surrogate codepoint? 1222 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1223 // byte3 trailing-byte test 1224 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1225 return MALFORMED; 1226 } 1227 } else { 1228 // four-byte form 1229 1230 // Get byte2 and byte3 from saved state or array 1231 int byte2 = (byte) ~(state >> 8); 1232 int byte3 = 0; 1233 if (byte2 == 0) { 1234 byte2 = UnsafeUtil.getByte(bytes, offset++); 1235 if (offset >= offsetLimit) { 1236 return incompleteStateFor(byte1, byte2); 1237 } 1238 } else { 1239 byte3 = (byte) (state >> 16); 1240 } 1241 if (byte3 == 0) { 1242 byte3 = UnsafeUtil.getByte(bytes, offset++); 1243 if (offset >= offsetLimit) { 1244 return incompleteStateFor(byte1, byte2, byte3); 1245 } 1246 } 1247 1248 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1249 // which never occurs in well-formed UTF-8, and so we will return 1250 // MALFORMED again below. 1251 1252 if (byte2 > (byte) 0xBF 1253 // Check that 1 <= plane <= 16. Tricky optimized form of: 1254 // if (byte1 > (byte) 0xF4 || 1255 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1256 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1257 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1258 // byte3 trailing-byte test 1259 || byte3 > (byte) 0xBF 1260 // byte4 trailing-byte test 1261 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1262 return MALFORMED; 1263 } 1264 } 1265 } 1266 1267 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1268 } 1269 1270 @Override partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1271 int partialIsValidUtf8Direct( 1272 final int state, ByteBuffer buffer, final int index, final int limit) { 1273 // Bitwise OR combines the sign bits so any negative value fails the check. 1274 if ((index | limit | buffer.limit() - limit) < 0) { 1275 throw new ArrayIndexOutOfBoundsException( 1276 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit)); 1277 } 1278 long address = addressOffset(buffer) + index; 1279 final long addressLimit = address + (limit - index); 1280 if (state != COMPLETE) { 1281 // The previous decoding operation was incomplete (or malformed). 1282 // We look for a well-formed sequence consisting of bytes from 1283 // the previous decoding operation (stored in state) together 1284 // with bytes from the array slice. 1285 // 1286 // We expect such "straddler characters" to be rare. 1287 1288 if (address >= addressLimit) { // No bytes? No progress. 1289 return state; 1290 } 1291 1292 final int byte1 = (byte) state; 1293 // byte1 is never ASCII. 1294 if (byte1 < (byte) 0xE0) { 1295 // two-byte form 1296 1297 // Simultaneously checks for illegal trailing-byte in 1298 // leading position and overlong 2-byte form. 1299 if (byte1 < (byte) 0xC2 1300 // byte2 trailing-byte test 1301 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1302 return MALFORMED; 1303 } 1304 } else if (byte1 < (byte) 0xF0) { 1305 // three-byte form 1306 1307 // Get byte2 from saved state or array 1308 int byte2 = (byte) ~(state >> 8); 1309 if (byte2 == 0) { 1310 byte2 = UnsafeUtil.getByte(address++); 1311 if (address >= addressLimit) { 1312 return incompleteStateFor(byte1, byte2); 1313 } 1314 } 1315 if (byte2 > (byte) 0xBF 1316 // overlong? 5 most significant bits must not all be zero 1317 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1318 // illegal surrogate codepoint? 1319 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1320 // byte3 trailing-byte test 1321 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1322 return MALFORMED; 1323 } 1324 } else { 1325 // four-byte form 1326 1327 // Get byte2 and byte3 from saved state or array 1328 int byte2 = (byte) ~(state >> 8); 1329 int byte3 = 0; 1330 if (byte2 == 0) { 1331 byte2 = UnsafeUtil.getByte(address++); 1332 if (address >= addressLimit) { 1333 return incompleteStateFor(byte1, byte2); 1334 } 1335 } else { 1336 byte3 = (byte) (state >> 16); 1337 } 1338 if (byte3 == 0) { 1339 byte3 = UnsafeUtil.getByte(address++); 1340 if (address >= addressLimit) { 1341 return incompleteStateFor(byte1, byte2, byte3); 1342 } 1343 } 1344 1345 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1346 // which never occurs in well-formed UTF-8, and so we will return 1347 // MALFORMED again below. 1348 1349 if (byte2 > (byte) 0xBF 1350 // Check that 1 <= plane <= 16. Tricky optimized form of: 1351 // if (byte1 > (byte) 0xF4 || 1352 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1353 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1354 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1355 // byte3 trailing-byte test 1356 || byte3 > (byte) 0xBF 1357 // byte4 trailing-byte test 1358 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1359 return MALFORMED; 1360 } 1361 } 1362 } 1363 1364 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1365 } 1366 1367 @Override decodeUtf8(byte[] bytes, int index, int size)1368 String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException { 1369 if ((index | size | bytes.length - index - size) < 0) { 1370 throw new ArrayIndexOutOfBoundsException( 1371 String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); 1372 } 1373 1374 int offset = index; 1375 final int limit = offset + size; 1376 1377 // The longest possible resulting String is the same as the number of input bytes, when it is 1378 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 1379 char[] resultArr = new char[size]; 1380 int resultPos = 0; 1381 1382 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 1383 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1384 while (offset < limit) { 1385 byte b = UnsafeUtil.getByte(bytes, offset); 1386 if (!DecodeUtil.isOneByte(b)) { 1387 break; 1388 } 1389 offset++; 1390 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1391 } 1392 1393 while (offset < limit) { 1394 byte byte1 = UnsafeUtil.getByte(bytes, offset++); 1395 if (DecodeUtil.isOneByte(byte1)) { 1396 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 1397 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 1398 // extra optimized loop to take care of these runs. 1399 while (offset < limit) { 1400 byte b = UnsafeUtil.getByte(bytes, offset); 1401 if (!DecodeUtil.isOneByte(b)) { 1402 break; 1403 } 1404 offset++; 1405 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1406 } 1407 } else if (DecodeUtil.isTwoBytes(byte1)) { 1408 if (offset >= limit) { 1409 throw InvalidProtocolBufferException.invalidUtf8(); 1410 } 1411 DecodeUtil.handleTwoBytes( 1412 byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++); 1413 } else if (DecodeUtil.isThreeBytes(byte1)) { 1414 if (offset >= limit - 1) { 1415 throw InvalidProtocolBufferException.invalidUtf8(); 1416 } 1417 DecodeUtil.handleThreeBytes( 1418 byte1, 1419 /* byte2 */ UnsafeUtil.getByte(bytes, offset++), 1420 /* byte3 */ UnsafeUtil.getByte(bytes, offset++), 1421 resultArr, 1422 resultPos++); 1423 } else { 1424 if (offset >= limit - 2) { 1425 throw InvalidProtocolBufferException.invalidUtf8(); 1426 } 1427 DecodeUtil.handleFourBytes( 1428 byte1, 1429 /* byte2 */ UnsafeUtil.getByte(bytes, offset++), 1430 /* byte3 */ UnsafeUtil.getByte(bytes, offset++), 1431 /* byte4 */ UnsafeUtil.getByte(bytes, offset++), 1432 resultArr, 1433 resultPos++); 1434 // 4-byte case requires two chars. 1435 resultPos++; 1436 } 1437 } 1438 1439 return new String(resultArr, 0, resultPos); 1440 } 1441 1442 @Override decodeUtf8Direct(ByteBuffer buffer, int index, int size)1443 String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 1444 throws InvalidProtocolBufferException { 1445 // Bitwise OR combines the sign bits so any negative value fails the check. 1446 if ((index | size | buffer.limit() - index - size) < 0) { 1447 throw new ArrayIndexOutOfBoundsException( 1448 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size)); 1449 } 1450 long address = UnsafeUtil.addressOffset(buffer) + index; 1451 final long addressLimit = address + size; 1452 1453 // The longest possible resulting String is the same as the number of input bytes, when it is 1454 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 1455 char[] resultArr = new char[size]; 1456 int resultPos = 0; 1457 1458 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 1459 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1460 while (address < addressLimit) { 1461 byte b = UnsafeUtil.getByte(address); 1462 if (!DecodeUtil.isOneByte(b)) { 1463 break; 1464 } 1465 address++; 1466 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1467 } 1468 1469 while (address < addressLimit) { 1470 byte byte1 = UnsafeUtil.getByte(address++); 1471 if (DecodeUtil.isOneByte(byte1)) { 1472 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 1473 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 1474 // extra optimized loop to take care of these runs. 1475 while (address < addressLimit) { 1476 byte b = UnsafeUtil.getByte(address); 1477 if (!DecodeUtil.isOneByte(b)) { 1478 break; 1479 } 1480 address++; 1481 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1482 } 1483 } else if (DecodeUtil.isTwoBytes(byte1)) { 1484 if (address >= addressLimit) { 1485 throw InvalidProtocolBufferException.invalidUtf8(); 1486 } 1487 DecodeUtil.handleTwoBytes( 1488 byte1, /* byte2 */ UnsafeUtil.getByte(address++), resultArr, resultPos++); 1489 } else if (DecodeUtil.isThreeBytes(byte1)) { 1490 if (address >= addressLimit - 1) { 1491 throw InvalidProtocolBufferException.invalidUtf8(); 1492 } 1493 DecodeUtil.handleThreeBytes( 1494 byte1, 1495 /* byte2 */ UnsafeUtil.getByte(address++), 1496 /* byte3 */ UnsafeUtil.getByte(address++), 1497 resultArr, 1498 resultPos++); 1499 } else { 1500 if (address >= addressLimit - 2) { 1501 throw InvalidProtocolBufferException.invalidUtf8(); 1502 } 1503 DecodeUtil.handleFourBytes( 1504 byte1, 1505 /* byte2 */ UnsafeUtil.getByte(address++), 1506 /* byte3 */ UnsafeUtil.getByte(address++), 1507 /* byte4 */ UnsafeUtil.getByte(address++), 1508 resultArr, 1509 resultPos++); 1510 // 4-byte case requires two chars. 1511 resultPos++; 1512 } 1513 } 1514 1515 return new String(resultArr, 0, resultPos); 1516 } 1517 1518 @Override encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1519 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) { 1520 long outIx = offset; 1521 final long outLimit = outIx + length; 1522 final int inLimit = in.length(); 1523 if (inLimit > length || out.length - length < offset) { 1524 // Not even enough room for an ASCII-encoded string. 1525 throw new ArrayIndexOutOfBoundsException( 1526 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1527 } 1528 1529 // Designed to take advantage of 1530 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1531 int inIx = 0; 1532 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1533 UnsafeUtil.putByte(out, outIx++, (byte) c); 1534 } 1535 if (inIx == inLimit) { 1536 // We're done, it was ASCII encoded. 1537 return (int) outIx; 1538 } 1539 1540 for (char c; inIx < inLimit; ++inIx) { 1541 c = in.charAt(inIx); 1542 if (c < 0x80 && outIx < outLimit) { 1543 UnsafeUtil.putByte(out, outIx++, (byte) c); 1544 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1545 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1546 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1547 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1548 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1549 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1550 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1551 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1552 } else if (outIx <= outLimit - 4L) { 1553 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1554 // bytes 1555 final char low; 1556 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1557 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1558 } 1559 int codePoint = toCodePoint(c, low); 1560 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1561 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1562 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1563 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1564 } else { 1565 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1566 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1567 // We are surrogates and we're not a surrogate pair. 1568 throw new UnpairedSurrogateException(inIx, inLimit); 1569 } 1570 // Not enough space in the output buffer. 1571 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1572 } 1573 } 1574 1575 // All bytes have been encoded. 1576 return (int) outIx; 1577 } 1578 1579 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)1580 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1581 final long address = addressOffset(out); 1582 long outIx = address + out.position(); 1583 final long outLimit = address + out.limit(); 1584 final int inLimit = in.length(); 1585 if (inLimit > outLimit - outIx) { 1586 // Not even enough room for an ASCII-encoded string. 1587 throw new ArrayIndexOutOfBoundsException( 1588 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit()); 1589 } 1590 1591 // Designed to take advantage of 1592 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1593 int inIx = 0; 1594 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1595 UnsafeUtil.putByte(outIx++, (byte) c); 1596 } 1597 if (inIx == inLimit) { 1598 // We're done, it was ASCII encoded. 1599 out.position((int) (outIx - address)); 1600 return; 1601 } 1602 1603 for (char c; inIx < inLimit; ++inIx) { 1604 c = in.charAt(inIx); 1605 if (c < 0x80 && outIx < outLimit) { 1606 UnsafeUtil.putByte(outIx++, (byte) c); 1607 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1608 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1609 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1610 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1611 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1612 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1613 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1614 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1615 } else if (outIx <= outLimit - 4L) { 1616 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1617 // bytes 1618 final char low; 1619 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1620 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1621 } 1622 int codePoint = toCodePoint(c, low); 1623 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1624 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1625 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1626 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1627 } else { 1628 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1629 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1630 // We are surrogates and we're not a surrogate pair. 1631 throw new UnpairedSurrogateException(inIx, inLimit); 1632 } 1633 // Not enough space in the output buffer. 1634 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1635 } 1636 } 1637 1638 // All bytes have been encoded. 1639 out.position((int) (outIx - address)); 1640 } 1641 1642 /** 1643 * Counts (approximately) the number of consecutive ASCII characters starting from the given 1644 * position, using the most efficient method available to the platform. 1645 * 1646 * @param bytes the array containing the character sequence 1647 * @param offset the offset position of the index (same as index + arrayBaseOffset) 1648 * @param maxChars the maximum number of characters to count 1649 * @return the number of ASCII characters found. The stopping position will be at or before the 1650 * first non-ASCII byte. 1651 */ unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1652 private static int unsafeEstimateConsecutiveAscii( 1653 byte[] bytes, long offset, final int maxChars) { 1654 if (maxChars < UNSAFE_COUNT_ASCII_THRESHOLD) { 1655 // Don't bother with small strings. 1656 return 0; 1657 } 1658 1659 for (int i = 0; i < maxChars; i++) { 1660 if (UnsafeUtil.getByte(bytes, offset++) < 0) { 1661 return i; 1662 } 1663 } 1664 return maxChars; 1665 } 1666 1667 /** 1668 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1669 * most efficient method available to the platform. 1670 */ unsafeEstimateConsecutiveAscii(long address, final int maxChars)1671 private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) { 1672 int remaining = maxChars; 1673 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1674 // Don't bother with small strings. 1675 return 0; 1676 } 1677 1678 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1679 // We do this by ANDing the address with 7 to determine the number of bytes that need to 1680 // be read before we're 8-byte aligned. 1681 final int unaligned = 8 - ((int) address & 7); 1682 for (int j = unaligned; j > 0; j--) { 1683 if (UnsafeUtil.getByte(address++) < 0) { 1684 return unaligned - j; 1685 } 1686 } 1687 1688 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1689 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1690 // determine if any byte in the current long is non-ASCII. 1691 remaining -= unaligned; 1692 for (; 1693 remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0; 1694 address += 8, remaining -= 8) {} 1695 return maxChars - remaining; 1696 } 1697 partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1698 private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) { 1699 // Skip past ASCII characters as quickly as possible. 1700 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining); 1701 remaining -= skipped; 1702 offset += skipped; 1703 1704 for (; ; ) { 1705 // Optimize for interior runs of ASCII bytes. 1706 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1707 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1708 int byte1 = 0; 1709 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {} 1710 if (remaining == 0) { 1711 return COMPLETE; 1712 } 1713 remaining--; 1714 1715 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1716 if (byte1 < (byte) 0xE0) { 1717 // Two-byte form (110xxxxx 10xxxxxx) 1718 if (remaining == 0) { 1719 // Incomplete sequence 1720 return byte1; 1721 } 1722 remaining--; 1723 1724 // Simultaneously checks for illegal trailing-byte in 1725 // leading position and overlong 2-byte form. 1726 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1727 return MALFORMED; 1728 } 1729 } else if (byte1 < (byte) 0xF0) { 1730 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1731 if (remaining < 2) { 1732 // Incomplete sequence 1733 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1734 } 1735 remaining -= 2; 1736 1737 final int byte2; 1738 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1739 // overlong? 5 most significant bits must not all be zero 1740 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1741 // check for illegal surrogate codepoints 1742 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1743 // byte3 trailing-byte test 1744 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1745 return MALFORMED; 1746 } 1747 } else { 1748 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 1749 if (remaining < 3) { 1750 // Incomplete sequence 1751 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1752 } 1753 remaining -= 3; 1754 1755 final int byte2; 1756 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1757 // Check that 1 <= plane <= 16. Tricky optimized form of: 1758 // if (byte1 > (byte) 0xF4 || 1759 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1760 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1761 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1762 // byte3 trailing-byte test 1763 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF 1764 // byte4 trailing-byte test 1765 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1766 return MALFORMED; 1767 } 1768 } 1769 } 1770 } 1771 partialIsValidUtf8(long address, int remaining)1772 private static int partialIsValidUtf8(long address, int remaining) { 1773 // Skip past ASCII characters as quickly as possible. 1774 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1775 address += skipped; 1776 remaining -= skipped; 1777 1778 for (; ; ) { 1779 // Optimize for interior runs of ASCII bytes. 1780 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1781 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1782 int byte1 = 0; 1783 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {} 1784 if (remaining == 0) { 1785 return COMPLETE; 1786 } 1787 remaining--; 1788 1789 if (byte1 < (byte) 0xE0) { 1790 // Two-byte form 1791 1792 if (remaining == 0) { 1793 // Incomplete sequence 1794 return byte1; 1795 } 1796 remaining--; 1797 1798 // Simultaneously checks for illegal trailing-byte in 1799 // leading position and overlong 2-byte form. 1800 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1801 return MALFORMED; 1802 } 1803 } else if (byte1 < (byte) 0xF0) { 1804 // Three-byte form 1805 1806 if (remaining < 2) { 1807 // Incomplete sequence 1808 return unsafeIncompleteStateFor(address, byte1, remaining); 1809 } 1810 remaining -= 2; 1811 1812 final byte byte2 = UnsafeUtil.getByte(address++); 1813 if (byte2 > (byte) 0xBF 1814 // overlong? 5 most significant bits must not all be zero 1815 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1816 // check for illegal surrogate codepoints 1817 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1818 // byte3 trailing-byte test 1819 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1820 return MALFORMED; 1821 } 1822 } else { 1823 // Four-byte form 1824 1825 if (remaining < 3) { 1826 // Incomplete sequence 1827 return unsafeIncompleteStateFor(address, byte1, remaining); 1828 } 1829 remaining -= 3; 1830 1831 final byte byte2 = UnsafeUtil.getByte(address++); 1832 if (byte2 > (byte) 0xBF 1833 // Check that 1 <= plane <= 16. Tricky optimized form of: 1834 // if (byte1 > (byte) 0xF4 || 1835 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1836 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1837 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1838 // byte3 trailing-byte test 1839 || UnsafeUtil.getByte(address++) > (byte) 0xBF 1840 // byte4 trailing-byte test 1841 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1842 return MALFORMED; 1843 } 1844 } 1845 } 1846 } 1847 unsafeIncompleteStateFor( byte[] bytes, int byte1, long offset, int remaining)1848 private static int unsafeIncompleteStateFor( 1849 byte[] bytes, int byte1, long offset, int remaining) { 1850 switch (remaining) { 1851 case 0: 1852 return incompleteStateFor(byte1); 1853 case 1: 1854 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); 1855 case 2: 1856 return incompleteStateFor( 1857 byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1)); 1858 default: 1859 throw new AssertionError(); 1860 } 1861 } 1862 unsafeIncompleteStateFor(long address, final int byte1, int remaining)1863 private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) { 1864 switch (remaining) { 1865 case 0: 1866 return incompleteStateFor(byte1); 1867 case 1: 1868 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); 1869 case 2: 1870 return incompleteStateFor( 1871 byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1)); 1872 default: 1873 throw new AssertionError(); 1874 } 1875 } 1876 } 1877 1878 /** 1879 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting 1880 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity 1881 * checks and codepoint conversion happen in this class. 1882 */ 1883 private static class DecodeUtil { 1884 1885 /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */ isOneByte(byte b)1886 private static boolean isOneByte(byte b) { 1887 return b >= 0; 1888 } 1889 1890 /** Returns whether this is a two-byte codepoint with the form '10XXXXXX'. */ isTwoBytes(byte b)1891 private static boolean isTwoBytes(byte b) { 1892 return b < (byte) 0xE0; 1893 } 1894 1895 /** Returns whether this is a three-byte codepoint with the form '110XXXXX'. */ isThreeBytes(byte b)1896 private static boolean isThreeBytes(byte b) { 1897 return b < (byte) 0xF0; 1898 } 1899 handleOneByte(byte byte1, char[] resultArr, int resultPos)1900 private static void handleOneByte(byte byte1, char[] resultArr, int resultPos) { 1901 resultArr[resultPos] = (char) byte1; 1902 } 1903 handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)1904 private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos) 1905 throws InvalidProtocolBufferException { 1906 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and 1907 // overlong 2-byte, '11000001'. 1908 if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) { 1909 throw InvalidProtocolBufferException.invalidUtf8(); 1910 } 1911 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); 1912 } 1913 handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)1914 private static void handleThreeBytes( 1915 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos) 1916 throws InvalidProtocolBufferException { 1917 if (isNotTrailingByte(byte2) 1918 // overlong? 5 most significant bits must not all be zero 1919 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1920 // check for illegal surrogate codepoints 1921 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1922 || isNotTrailingByte(byte3)) { 1923 throw InvalidProtocolBufferException.invalidUtf8(); 1924 } 1925 resultArr[resultPos] = 1926 (char) 1927 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); 1928 } 1929 handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)1930 private static void handleFourBytes( 1931 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos) 1932 throws InvalidProtocolBufferException { 1933 if (isNotTrailingByte(byte2) 1934 // Check that 1 <= plane <= 16. Tricky optimized form of: 1935 // valid 4-byte leading byte? 1936 // if (byte1 > (byte) 0xF4 || 1937 // overlong? 4 most significant bits must not all be zero 1938 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1939 // codepoint larger than the highest code point (U+10FFFF)? 1940 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1941 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1942 || isNotTrailingByte(byte3) 1943 || isNotTrailingByte(byte4)) { 1944 throw InvalidProtocolBufferException.invalidUtf8(); 1945 } 1946 int codepoint = 1947 ((byte1 & 0x07) << 18) 1948 | (trailingByteValue(byte2) << 12) 1949 | (trailingByteValue(byte3) << 6) 1950 | trailingByteValue(byte4); 1951 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint); 1952 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint); 1953 } 1954 1955 /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */ isNotTrailingByte(byte b)1956 private static boolean isNotTrailingByte(byte b) { 1957 return b > (byte) 0xBF; 1958 } 1959 1960 /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */ trailingByteValue(byte b)1961 private static int trailingByteValue(byte b) { 1962 return b & 0x3F; 1963 } 1964 highSurrogate(int codePoint)1965 private static char highSurrogate(int codePoint) { 1966 return (char) 1967 ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10)); 1968 } 1969 lowSurrogate(int codePoint)1970 private static char lowSurrogate(int codePoint) { 1971 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff)); 1972 } 1973 } 1974 Utf8()1975 private Utf8() {} 1976 } 1977