1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file or at 6 // https://developers.google.com/open-source/licenses/bsd 7 8 package com.google.protobuf; 9 10 import static com.google.protobuf.UnsafeUtil.addressOffset; 11 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; 12 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; 13 import static java.lang.Character.MAX_SURROGATE; 14 import static java.lang.Character.MIN_HIGH_SURROGATE; 15 import static java.lang.Character.MIN_LOW_SURROGATE; 16 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 17 import static java.lang.Character.MIN_SURROGATE; 18 import static java.lang.Character.isSurrogatePair; 19 import static java.lang.Character.toCodePoint; 20 21 import java.nio.ByteBuffer; 22 import java.util.Arrays; 23 24 /** 25 * A set of low-level, high-performance static utility methods related to the UTF-8 character 26 * encoding. This class has no dependencies outside of the core JDK libraries. 27 * 28 * <p>There are several variants of UTF-8. The one implemented by this class is the restricted 29 * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte 30 * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8 31 * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but 32 * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences. 33 * 34 * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip 35 * converted to Strings and back to bytes using the UTF-8 charset, without loss: 36 * 37 * <pre>{@code 38 * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) 39 * }</pre> 40 * 41 * <p>See the Unicode Standard,</br> Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> Table 3-7. 42 * <em>Well Formed UTF-8 Byte Sequences</em>. 43 * 44 * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8 45 * byte sequence can be stored in multiple segments. Methods typically return {@link #MALFORMED} if 46 * the partial byte sequence is definitely not well-formed; {@link #COMPLETE} if it is well-formed 47 * in the absence of additional input; or, if the byte sequence apparently terminated in the middle 48 * of a character, an opaque integer "state" value containing enough information to decode the 49 * character when passed to a subsequent invocation of a partial decoding method. 50 * 51 * @author martinrb@google.com (Martin Buchholz) 52 */ 53 // TODO: Copy changes in this class back to Guava 54 final class Utf8 { 55 56 /** 57 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 58 * depending on what is available on the platform. The processor is the platform-optimized 59 * delegate for which all methods are delegated directly to. 60 */ 61 private static final Processor processor = 62 (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice()) 63 ? new UnsafeProcessor() 64 : new SafeProcessor(); 65 66 /** 67 * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII 68 * characters (i.e. any byte >= 0x80). 69 */ 70 private static final long ASCII_MASK_LONG = 0x8080808080808080L; 71 72 /** 73 * Maximum number of bytes per Java UTF-16 char in UTF-8. 74 * 75 * @see java.nio.charset.CharsetEncoder#maxBytesPerChar() 76 */ 77 static final int MAX_BYTES_PER_CHAR = 3; 78 79 /** 80 * State value indicating that the byte sequence is well-formed and complete (no further bytes are 81 * needed to complete a character). 82 */ 83 static final int COMPLETE = 0; 84 85 /** State value indicating that the byte sequence is definitely not well-formed. */ 86 static final int MALFORMED = -1; 87 88 /** 89 * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length 90 * above which to employ an optimized algorithm for counting ASCII characters. The reason for this 91 * threshold is that for small strings, the optimization may not be beneficial or may even 92 * negatively impact performance since it requires additional logic to avoid unaligned reads (when 93 * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is 94 * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which 95 * provides a performance improvement that entirely subsumes the cost of the additional logic. 96 */ 97 private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16; 98 99 // Other state values include the partial bytes of the incomplete 100 // character to be decoded in the simplest way: we pack the bytes 101 // into the state int in little-endian order. For example: 102 // 103 // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16); 104 // 105 // Such a state is unpacked thus (note the ~ operation for byte2 to 106 // undo byte1's sign-extension bits): 107 // 108 // int byte1 = (byte) state; 109 // int byte2 = (byte) ~(state >> 8); 110 // int byte3 = (byte) (state >> 16); 111 // 112 // We cannot store a zero byte in the state because it would be 113 // indistinguishable from the absence of a byte. But we don't need 114 // to, because partial bytes must always be negative. When building 115 // a state, we ensure that byte1 is negative and subsequent bytes 116 // are valid trailing bytes. 117 118 /** 119 * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence. 120 * 121 * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0, 122 * bytes.length)}. 123 */ isValidUtf8(byte[] bytes)124 static boolean isValidUtf8(byte[] bytes) { 125 return processor.isValidUtf8(bytes, 0, bytes.length); 126 } 127 128 /** 129 * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The 130 * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, 131 * exclusive. 132 * 133 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 134 * == Utf8.COMPLETE}. 135 */ isValidUtf8(byte[] bytes, int index, int limit)136 static boolean isValidUtf8(byte[] bytes, int index, int limit) { 137 return processor.isValidUtf8(bytes, index, limit); 138 } 139 140 /** 141 * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte 142 * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to 143 * {@code limit}, exclusive. 144 * 145 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the 146 * value returned from a call to a partial decoding method for the previous bytes 147 * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link 148 * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is 149 * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer 150 * "state" value containing enough information to decode the character when passed to a 151 * subsequent invocation of a partial decoding method. 152 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)153 static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 154 return processor.partialIsValidUtf8(state, bytes, index, limit); 155 } 156 incompleteStateFor(int byte1)157 private static int incompleteStateFor(int byte1) { 158 return (byte1 > (byte) 0xF4) ? MALFORMED : byte1; 159 } 160 incompleteStateFor(int byte1, int byte2)161 private static int incompleteStateFor(int byte1, int byte2) { 162 return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8); 163 } 164 incompleteStateFor(int byte1, int byte2, int byte3)165 private static int incompleteStateFor(int byte1, int byte2, int byte3) { 166 return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF) 167 ? MALFORMED 168 : byte1 ^ (byte2 << 8) ^ (byte3 << 16); 169 } 170 incompleteStateFor(byte[] bytes, int index, int limit)171 private static int incompleteStateFor(byte[] bytes, int index, int limit) { 172 int byte1 = bytes[index - 1]; 173 switch (limit - index) { 174 case 0: 175 return incompleteStateFor(byte1); 176 case 1: 177 return incompleteStateFor(byte1, bytes[index]); 178 case 2: 179 return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); 180 default: 181 throw new AssertionError(); 182 } 183 } 184 incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)185 private static int incompleteStateFor( 186 final ByteBuffer buffer, final int byte1, final int index, final int remaining) { 187 switch (remaining) { 188 case 0: 189 return incompleteStateFor(byte1); 190 case 1: 191 return incompleteStateFor(byte1, buffer.get(index)); 192 case 2: 193 return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1)); 194 default: 195 throw new AssertionError(); 196 } 197 } 198 199 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 200 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 201 // fallback to more lenient behavior. 202 203 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)204 UnpairedSurrogateException(int index, int length) { 205 super("Unpaired surrogate at index " + index + " of " + length); 206 } 207 } 208 209 /** 210 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this 211 * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both 212 * time and space. 213 * 214 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 215 * surrogates) 216 */ encodedLength(String string)217 static int encodedLength(String string) { 218 // Warning to maintainers: this implementation is highly optimized. 219 int utf16Length = string.length(); 220 int utf8Length = utf16Length; 221 int i = 0; 222 223 // This loop optimizes for pure ASCII. 224 while (i < utf16Length && string.charAt(i) < 0x80) { 225 i++; 226 } 227 228 // This loop optimizes for chars less than 0x800. 229 for (; i < utf16Length; i++) { 230 char c = string.charAt(i); 231 if (c < 0x800) { 232 utf8Length += ((0x7f - c) >>> 31); // branch free! 233 } else { 234 utf8Length += encodedLengthGeneral(string, i); 235 break; 236 } 237 } 238 239 if (utf8Length < utf16Length) { 240 // Necessary and sufficient condition for overflow because of maximum 3x expansion 241 throw new IllegalArgumentException( 242 "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); 243 } 244 return utf8Length; 245 } 246 encodedLengthGeneral(String string, int start)247 private static int encodedLengthGeneral(String string, int start) { 248 int utf16Length = string.length(); 249 int utf8Length = 0; 250 for (int i = start; i < utf16Length; i++) { 251 char c = string.charAt(i); 252 if (c < 0x800) { 253 utf8Length += (0x7f - c) >>> 31; // branch free! 254 } else { 255 utf8Length += 2; 256 // jdk7+: if (Character.isSurrogate(c)) { 257 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 258 // Check that we have a well-formed surrogate pair. 259 int cp = Character.codePointAt(string, i); 260 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) { 261 throw new UnpairedSurrogateException(i, utf16Length); 262 } 263 i++; 264 } 265 } 266 } 267 return utf8Length; 268 } 269 encode(String in, byte[] out, int offset, int length)270 static int encode(String in, byte[] out, int offset, int length) { 271 return processor.encodeUtf8(in, out, offset, length); 272 } 273 // End Guava UTF-8 methods. 274 275 /** 276 * Determines if the given {@link ByteBuffer} is a valid UTF-8 string. 277 * 278 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 279 * and the capabilities of the platform. 280 * 281 * @param buffer the buffer to check. 282 * @see Utf8#isValidUtf8(byte[], int, int) 283 */ isValidUtf8(ByteBuffer buffer)284 static boolean isValidUtf8(ByteBuffer buffer) { 285 return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining()); 286 } 287 288 /** 289 * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string. 290 * 291 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 292 * and the capabilities of the platform. 293 * 294 * @param buffer the buffer to check. 295 * @see Utf8#partialIsValidUtf8(int, byte[], int, int) 296 */ partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)297 static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) { 298 return processor.partialIsValidUtf8(state, buffer, index, limit); 299 } 300 301 /** 302 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}. 303 * 304 * @throws InvalidProtocolBufferException if the input is not valid UTF-8. 305 */ decodeUtf8(ByteBuffer buffer, int index, int size)306 static String decodeUtf8(ByteBuffer buffer, int index, int size) 307 throws InvalidProtocolBufferException { 308 return processor.decodeUtf8(buffer, index, size); 309 } 310 311 /** 312 * Decodes the given UTF-8 encoded byte array slice into a {@link String}. 313 * 314 * @throws InvalidProtocolBufferException if the input is not valid UTF-8. 315 */ decodeUtf8(byte[] bytes, int index, int size)316 static String decodeUtf8(byte[] bytes, int index, int size) 317 throws InvalidProtocolBufferException { 318 return processor.decodeUtf8(bytes, index, size); 319 } 320 321 /** 322 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 323 * 324 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 325 * and the capabilities of the platform. 326 * 327 * @param in the source string to be encoded 328 * @param out the target buffer to receive the encoded string. 329 * @see Utf8#encode(String, byte[], int, int) 330 */ encodeUtf8(String in, ByteBuffer out)331 static void encodeUtf8(String in, ByteBuffer out) { 332 processor.encodeUtf8(in, out); 333 } 334 335 /** 336 * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte 337 * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte 338 * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). 339 * 340 * @param buffer the buffer to be scanned for ASCII chars 341 * @param index the starting index of the scan 342 * @param limit the limit within buffer for the scan 343 * @return the number of ASCII characters found. The stopping position will be at or before the 344 * first non-ASCII byte. 345 */ estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)346 private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) { 347 int i = index; 348 final int lim = limit - 7; 349 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 350 // To speed things up further, we're reading longs instead of bytes so we use a mask to 351 // determine if any byte in the current long is non-ASCII. 352 for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {} 353 return i - index; 354 } 355 356 /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */ 357 // TODO: Add support for Memory/MemoryBlock on Android. 358 abstract static class Processor { 359 /** 360 * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The 361 * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, 362 * exclusive. 363 * 364 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 365 * == Utf8.COMPLETE}. 366 */ isValidUtf8(byte[] bytes, int index, int limit)367 final boolean isValidUtf8(byte[] bytes, int index, int limit) { 368 return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE; 369 } 370 371 /** 372 * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 373 * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, 374 * to {@code limit}, exclusive. 375 * 376 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the 377 * value returned from a call to a partial decoding method for the previous bytes 378 * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link 379 * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is 380 * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer 381 * "state" value containing enough information to decode the character when passed to a 382 * subsequent invocation of a partial decoding method. 383 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)384 abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit); 385 386 /** 387 * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8 388 * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, 389 * to {@code limit}, exclusive. 390 * 391 * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) 392 * == Utf8.COMPLETE}. 393 */ isValidUtf8(ByteBuffer buffer, int index, int limit)394 final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) { 395 return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE; 396 } 397 398 /** 399 * Indicates whether or not the given buffer contains a valid UTF-8 string. 400 * 401 * @param buffer the buffer to check. 402 * @return {@code true} if the given buffer contains a valid UTF-8 string. 403 */ partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)404 final int partialIsValidUtf8( 405 final int state, final ByteBuffer buffer, int index, final int limit) { 406 if (buffer.hasArray()) { 407 final int offset = buffer.arrayOffset(); 408 return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit); 409 } else if (buffer.isDirect()) { 410 return partialIsValidUtf8Direct(state, buffer, index, limit); 411 } 412 return partialIsValidUtf8Default(state, buffer, index, limit); 413 } 414 415 /** Performs validation for direct {@link ByteBuffer} instances. */ partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)416 abstract int partialIsValidUtf8Direct( 417 final int state, final ByteBuffer buffer, int index, final int limit); 418 419 /** 420 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 421 * than potentially faster approaches. This first completes validation for the current character 422 * (provided by {@code state}) and then finishes validation for the sequence. 423 */ partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)424 final int partialIsValidUtf8Default( 425 final int state, final ByteBuffer buffer, int index, final int limit) { 426 if (state != COMPLETE) { 427 // The previous decoding operation was incomplete (or malformed). 428 // We look for a well-formed sequence consisting of bytes from 429 // the previous decoding operation (stored in state) together 430 // with bytes from the array slice. 431 // 432 // We expect such "straddler characters" to be rare. 433 434 if (index >= limit) { // No bytes? No progress. 435 return state; 436 } 437 438 byte byte1 = (byte) state; 439 // byte1 is never ASCII. 440 if (byte1 < (byte) 0xE0) { 441 // two-byte form 442 443 // Simultaneously checks for illegal trailing-byte in 444 // leading position and overlong 2-byte form. 445 if (byte1 < (byte) 0xC2 446 // byte2 trailing-byte test 447 || buffer.get(index++) > (byte) 0xBF) { 448 return MALFORMED; 449 } 450 } else if (byte1 < (byte) 0xF0) { 451 // three-byte form 452 453 // Get byte2 from saved state or array 454 byte byte2 = (byte) ~(state >> 8); 455 if (byte2 == 0) { 456 byte2 = buffer.get(index++); 457 if (index >= limit) { 458 return incompleteStateFor(byte1, byte2); 459 } 460 } 461 if (byte2 > (byte) 0xBF 462 // overlong? 5 most significant bits must not all be zero 463 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 464 // illegal surrogate codepoint? 465 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 466 // byte3 trailing-byte test 467 || buffer.get(index++) > (byte) 0xBF) { 468 return MALFORMED; 469 } 470 } else { 471 // four-byte form 472 473 // Get byte2 and byte3 from saved state or array 474 byte byte2 = (byte) ~(state >> 8); 475 byte byte3 = 0; 476 if (byte2 == 0) { 477 byte2 = buffer.get(index++); 478 if (index >= limit) { 479 return incompleteStateFor(byte1, byte2); 480 } 481 } else { 482 byte3 = (byte) (state >> 16); 483 } 484 if (byte3 == 0) { 485 byte3 = buffer.get(index++); 486 if (index >= limit) { 487 return incompleteStateFor(byte1, byte2, byte3); 488 } 489 } 490 491 // If we were called with state == MALFORMED, then byte1 is 0xFF, 492 // which never occurs in well-formed UTF-8, and so we will return 493 // MALFORMED again below. 494 495 if (byte2 > (byte) 0xBF 496 // Check that 1 <= plane <= 16. Tricky optimized form of: 497 // if (byte1 > (byte) 0xF4 || 498 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 499 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 500 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 501 // byte3 trailing-byte test 502 || byte3 > (byte) 0xBF 503 // byte4 trailing-byte test 504 || buffer.get(index++) > (byte) 0xBF) { 505 return MALFORMED; 506 } 507 } 508 } 509 510 // Finish validation for the sequence. 511 return partialIsValidUtf8(buffer, index, limit); 512 } 513 514 /** 515 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 516 * than potentially faster approaches. 517 */ partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)518 private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) { 519 index += estimateConsecutiveAscii(buffer, index, limit); 520 521 for (; ; ) { 522 // Optimize for interior runs of ASCII bytes. 523 // TODO: Consider checking 8 bytes at a time after some threshold? 524 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 525 int byte1; 526 do { 527 if (index >= limit) { 528 return COMPLETE; 529 } 530 } while ((byte1 = buffer.get(index++)) >= 0); 531 532 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 533 if (byte1 < (byte) 0xE0) { 534 // Two-byte form (110xxxxx 10xxxxxx) 535 if (index >= limit) { 536 // Incomplete sequence 537 return byte1; 538 } 539 540 // Simultaneously checks for illegal trailing-byte in 541 // leading position and overlong 2-byte form. 542 if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) { 543 return MALFORMED; 544 } 545 index++; 546 } else if (byte1 < (byte) 0xF0) { 547 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 548 if (index >= limit - 1) { 549 // Incomplete sequence 550 return incompleteStateFor(buffer, byte1, index, limit - index); 551 } 552 553 byte byte2 = buffer.get(index++); 554 if (byte2 > (byte) 0xBF 555 // overlong? 5 most significant bits must not all be zero 556 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 557 // check for illegal surrogate codepoints 558 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 559 // byte3 trailing-byte test 560 || buffer.get(index) > (byte) 0xBF) { 561 return MALFORMED; 562 } 563 index++; 564 } else { 565 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 566 if (index >= limit - 2) { 567 // Incomplete sequence 568 return incompleteStateFor(buffer, byte1, index, limit - index); 569 } 570 571 // TODO: Consider using getInt() to improve performance. 572 int byte2 = buffer.get(index++); 573 if (byte2 > (byte) 0xBF 574 // Check that 1 <= plane <= 16. Tricky optimized form of: 575 // if (byte1 > (byte) 0xF4 || 576 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 577 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 578 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 579 // byte3 trailing-byte test 580 || buffer.get(index++) > (byte) 0xBF 581 // byte4 trailing-byte test 582 || buffer.get(index++) > (byte) 0xBF) { 583 return MALFORMED; 584 } 585 } 586 } 587 } 588 589 /** 590 * Decodes the given byte array slice into a {@link String}. 591 * 592 * @throws InvalidProtocolBufferException if the byte array slice is not valid UTF-8 593 */ decodeUtf8(byte[] bytes, int index, int size)594 abstract String decodeUtf8(byte[] bytes, int index, int size) 595 throws InvalidProtocolBufferException; 596 597 /** 598 * Decodes the given portion of the {@link ByteBuffer} into a {@link String}. 599 * 600 * @throws InvalidProtocolBufferException if the portion of the buffer is not valid UTF-8 601 */ decodeUtf8(ByteBuffer buffer, int index, int size)602 final String decodeUtf8(ByteBuffer buffer, int index, int size) 603 throws InvalidProtocolBufferException { 604 if (buffer.hasArray()) { 605 final int offset = buffer.arrayOffset(); 606 return decodeUtf8(buffer.array(), offset + index, size); 607 } else if (buffer.isDirect()) { 608 return decodeUtf8Direct(buffer, index, size); 609 } 610 return decodeUtf8Default(buffer, index, size); 611 } 612 613 /** Decodes direct {@link ByteBuffer} instances into {@link String}. */ decodeUtf8Direct(ByteBuffer buffer, int index, int size)614 abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 615 throws InvalidProtocolBufferException; 616 617 /** 618 * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially 619 * faster approaches. 620 */ decodeUtf8Default(ByteBuffer buffer, int index, int size)621 final String decodeUtf8Default(ByteBuffer buffer, int index, int size) 622 throws InvalidProtocolBufferException { 623 // Bitwise OR combines the sign bits so any negative value fails the check. 624 if ((index | size | buffer.limit() - index - size) < 0) { 625 throw new ArrayIndexOutOfBoundsException( 626 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size)); 627 } 628 629 int offset = index; 630 int limit = offset + size; 631 632 // The longest possible resulting String is the same as the number of input bytes, when it is 633 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 634 char[] resultArr = new char[size]; 635 int resultPos = 0; 636 637 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 638 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 639 while (offset < limit) { 640 byte b = buffer.get(offset); 641 if (!DecodeUtil.isOneByte(b)) { 642 break; 643 } 644 offset++; 645 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 646 } 647 648 while (offset < limit) { 649 byte byte1 = buffer.get(offset++); 650 if (DecodeUtil.isOneByte(byte1)) { 651 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 652 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 653 // extra optimized loop to take care of these runs. 654 while (offset < limit) { 655 byte b = buffer.get(offset); 656 if (!DecodeUtil.isOneByte(b)) { 657 break; 658 } 659 offset++; 660 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 661 } 662 } else if (DecodeUtil.isTwoBytes(byte1)) { 663 if (offset >= limit) { 664 throw InvalidProtocolBufferException.invalidUtf8(); 665 } 666 DecodeUtil.handleTwoBytes( 667 byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++); 668 } else if (DecodeUtil.isThreeBytes(byte1)) { 669 if (offset >= limit - 1) { 670 throw InvalidProtocolBufferException.invalidUtf8(); 671 } 672 DecodeUtil.handleThreeBytes( 673 byte1, 674 /* byte2 */ buffer.get(offset++), 675 /* byte3 */ buffer.get(offset++), 676 resultArr, 677 resultPos++); 678 } else { 679 if (offset >= limit - 2) { 680 throw InvalidProtocolBufferException.invalidUtf8(); 681 } 682 DecodeUtil.handleFourBytes( 683 byte1, 684 /* byte2 */ buffer.get(offset++), 685 /* byte3 */ buffer.get(offset++), 686 /* byte4 */ buffer.get(offset++), 687 resultArr, 688 resultPos++); 689 // 4-byte case requires two chars. 690 resultPos++; 691 } 692 } 693 694 return new String(resultArr, 0, resultPos); 695 } 696 697 /** 698 * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}). 699 * For a string, this method is similar to 700 * 701 * <pre>{@code 702 * byte[] a = string.getBytes(UTF_8); 703 * System.arraycopy(a, 0, bytes, offset, a.length); 704 * return offset + a.length; 705 * }</pre> 706 * 707 * but is more efficient in both time and space. One key difference is that this method requires 708 * paired surrogates, and therefore does not support chunking. While {@code 709 * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character, 710 * this method throws {@link UnpairedSurrogateException}. 711 * 712 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 713 * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * 714 * sequence.length()}, which is the largest possible number of bytes that any input can be 715 * encoded to. 716 * 717 * @param in the input character sequence to be encoded 718 * @param out the target array 719 * @param offset the starting offset in {@code bytes} to start writing at 720 * @param length the length of the {@code bytes}, starting from {@code offset} 721 * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired 722 * surrogates) 723 * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than 724 * {@code bytes.length - offset} 725 * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)} 726 */ encodeUtf8(String in, byte[] out, int offset, int length)727 abstract int encodeUtf8(String in, byte[] out, int offset, int length); 728 729 /** 730 * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}). 731 * Upon returning from this method, the {@code out} position will point to the position after 732 * the last encoded byte. This method requires paired surrogates, and therefore does not support 733 * chunking. 734 * 735 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 736 * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * 737 * in.length()}, which is the largest possible number of bytes that any input can be encoded to. 738 * 739 * @param in the source character sequence to be encoded 740 * @param out the target buffer 741 * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired 742 * surrogates) 743 * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code 744 * out.remaining()} 745 */ encodeUtf8(String in, ByteBuffer out)746 final void encodeUtf8(String in, ByteBuffer out) { 747 if (out.hasArray()) { 748 final int offset = out.arrayOffset(); 749 int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); 750 Java8Compatibility.position(out, endIndex - offset); 751 } else if (out.isDirect()) { 752 encodeUtf8Direct(in, out); 753 } else { 754 encodeUtf8Default(in, out); 755 } 756 } 757 758 /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */ encodeUtf8Direct(String in, ByteBuffer out)759 abstract void encodeUtf8Direct(String in, ByteBuffer out); 760 761 /** 762 * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link 763 * ByteBuffer} API, rather than potentially faster approaches. 764 */ encodeUtf8Default(String in, ByteBuffer out)765 final void encodeUtf8Default(String in, ByteBuffer out) { 766 final int inLength = in.length(); 767 int outIx = out.position(); 768 int inIx = 0; 769 770 // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check 771 // access. Assume the buffer is big enough and let it handle the out of bounds exception 772 // if it occurs. 773 try { 774 // Designed to take advantage of 775 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 776 for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { 777 out.put(outIx + inIx, (byte) c); 778 } 779 if (inIx == inLength) { 780 // Successfully encoded the entire string. 781 Java8Compatibility.position(out, outIx + inIx); 782 return; 783 } 784 785 outIx += inIx; 786 for (char c; inIx < inLength; ++inIx, ++outIx) { 787 c = in.charAt(inIx); 788 if (c < 0x80) { 789 // One byte (0xxx xxxx) 790 out.put(outIx, (byte) c); 791 } else if (c < 0x800) { 792 // Two bytes (110x xxxx 10xx xxxx) 793 794 // Benchmarks show put performs better than putShort here (for HotSpot). 795 out.put(outIx++, (byte) (0xC0 | (c >>> 6))); 796 out.put(outIx, (byte) (0x80 | (0x3F & c))); 797 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 798 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 799 // Maximum single-char code point is 0xFFFF, 16 bits. 800 801 // Benchmarks show put performs better than putShort here (for HotSpot). 802 out.put(outIx++, (byte) (0xE0 | (c >>> 12))); 803 out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 804 out.put(outIx, (byte) (0x80 | (0x3F & c))); 805 } else { 806 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 807 808 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 809 // bytes 810 final char low; 811 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 812 throw new UnpairedSurrogateException(inIx, inLength); 813 } 814 // TODO: Consider using putInt() to improve performance. 815 int codePoint = toCodePoint(c, low); 816 out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 817 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 818 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 819 out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); 820 } 821 } 822 823 // Successfully encoded the entire string. 824 Java8Compatibility.position(out, outIx); 825 } catch (IndexOutOfBoundsException e) { 826 // TODO: Consider making the API throw IndexOutOfBoundsException instead. 827 828 // If we failed in the outer ASCII loop, outIx will not have been updated. In this case, 829 // use inIx to determine the bad write index. 830 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1); 831 throw new ArrayIndexOutOfBoundsException( 832 "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex); 833 } 834 } 835 } 836 837 /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */ 838 static final class SafeProcessor extends Processor { 839 @Override partialIsValidUtf8(int state, byte[] bytes, int index, int limit)840 int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 841 if (state != COMPLETE) { 842 // The previous decoding operation was incomplete (or malformed). 843 // We look for a well-formed sequence consisting of bytes from 844 // the previous decoding operation (stored in state) together 845 // with bytes from the array slice. 846 // 847 // We expect such "straddler characters" to be rare. 848 849 if (index >= limit) { // No bytes? No progress. 850 return state; 851 } 852 int byte1 = (byte) state; 853 // byte1 is never ASCII. 854 if (byte1 < (byte) 0xE0) { 855 // two-byte form 856 857 // Simultaneously checks for illegal trailing-byte in 858 // leading position and overlong 2-byte form. 859 if (byte1 < (byte) 0xC2 860 // byte2 trailing-byte test 861 || bytes[index++] > (byte) 0xBF) { 862 return MALFORMED; 863 } 864 } else if (byte1 < (byte) 0xF0) { 865 // three-byte form 866 867 // Get byte2 from saved state or array 868 int byte2 = (byte) ~(state >> 8); 869 if (byte2 == 0) { 870 byte2 = bytes[index++]; 871 if (index >= limit) { 872 return incompleteStateFor(byte1, byte2); 873 } 874 } 875 if (byte2 > (byte) 0xBF 876 // overlong? 5 most significant bits must not all be zero 877 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 878 // illegal surrogate codepoint? 879 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 880 // byte3 trailing-byte test 881 || bytes[index++] > (byte) 0xBF) { 882 return MALFORMED; 883 } 884 } else { 885 // four-byte form 886 887 // Get byte2 and byte3 from saved state or array 888 int byte2 = (byte) ~(state >> 8); 889 int byte3 = 0; 890 if (byte2 == 0) { 891 byte2 = bytes[index++]; 892 if (index >= limit) { 893 return incompleteStateFor(byte1, byte2); 894 } 895 } else { 896 byte3 = (byte) (state >> 16); 897 } 898 if (byte3 == 0) { 899 byte3 = bytes[index++]; 900 if (index >= limit) { 901 return incompleteStateFor(byte1, byte2, byte3); 902 } 903 } 904 905 // If we were called with state == MALFORMED, then byte1 is 0xFF, 906 // which never occurs in well-formed UTF-8, and so we will return 907 // MALFORMED again below. 908 909 if (byte2 > (byte) 0xBF 910 // Check that 1 <= plane <= 16. Tricky optimized form of: 911 // if (byte1 > (byte) 0xF4 || 912 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 913 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 914 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 915 // byte3 trailing-byte test 916 || byte3 > (byte) 0xBF 917 // byte4 trailing-byte test 918 || bytes[index++] > (byte) 0xBF) { 919 return MALFORMED; 920 } 921 } 922 } 923 924 return partialIsValidUtf8(bytes, index, limit); 925 } 926 927 @Override partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)928 int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) { 929 // For safe processing, we have to use the ByteBuffer API. 930 return partialIsValidUtf8Default(state, buffer, index, limit); 931 } 932 933 @Override decodeUtf8(byte[] bytes, int index, int size)934 String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException { 935 // Bitwise OR combines the sign bits so any negative value fails the check. 936 if ((index | size | bytes.length - index - size) < 0) { 937 throw new ArrayIndexOutOfBoundsException( 938 String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); 939 } 940 941 int offset = index; 942 final int limit = offset + size; 943 944 // The longest possible resulting String is the same as the number of input bytes, when it is 945 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 946 char[] resultArr = new char[size]; 947 int resultPos = 0; 948 949 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 950 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 951 while (offset < limit) { 952 byte b = bytes[offset]; 953 if (!DecodeUtil.isOneByte(b)) { 954 break; 955 } 956 offset++; 957 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 958 } 959 960 while (offset < limit) { 961 byte byte1 = bytes[offset++]; 962 if (DecodeUtil.isOneByte(byte1)) { 963 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 964 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 965 // extra optimized loop to take care of these runs. 966 while (offset < limit) { 967 byte b = bytes[offset]; 968 if (!DecodeUtil.isOneByte(b)) { 969 break; 970 } 971 offset++; 972 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 973 } 974 } else if (DecodeUtil.isTwoBytes(byte1)) { 975 if (offset >= limit) { 976 throw InvalidProtocolBufferException.invalidUtf8(); 977 } 978 DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++); 979 } else if (DecodeUtil.isThreeBytes(byte1)) { 980 if (offset >= limit - 1) { 981 throw InvalidProtocolBufferException.invalidUtf8(); 982 } 983 DecodeUtil.handleThreeBytes( 984 byte1, 985 /* byte2 */ bytes[offset++], 986 /* byte3 */ bytes[offset++], 987 resultArr, 988 resultPos++); 989 } else { 990 if (offset >= limit - 2) { 991 throw InvalidProtocolBufferException.invalidUtf8(); 992 } 993 DecodeUtil.handleFourBytes( 994 byte1, 995 /* byte2 */ bytes[offset++], 996 /* byte3 */ bytes[offset++], 997 /* byte4 */ bytes[offset++], 998 resultArr, 999 resultPos++); 1000 // 4-byte case requires two chars. 1001 resultPos++; 1002 } 1003 } 1004 1005 return new String(resultArr, 0, resultPos); 1006 } 1007 1008 @Override decodeUtf8Direct(ByteBuffer buffer, int index, int size)1009 String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 1010 throws InvalidProtocolBufferException { 1011 // For safe processing, we have to use the ByteBufferAPI. 1012 return decodeUtf8Default(buffer, index, size); 1013 } 1014 1015 @Override encodeUtf8(String in, byte[] out, int offset, int length)1016 int encodeUtf8(String in, byte[] out, int offset, int length) { 1017 int utf16Length = in.length(); 1018 int j = offset; 1019 int i = 0; 1020 int limit = offset + length; 1021 // Designed to take advantage of 1022 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1023 for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) { 1024 out[j + i] = (byte) c; 1025 } 1026 if (i == utf16Length) { 1027 return j + utf16Length; 1028 } 1029 j += i; 1030 for (char c; i < utf16Length; i++) { 1031 c = in.charAt(i); 1032 if (c < 0x80 && j < limit) { 1033 out[j++] = (byte) c; 1034 } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes 1035 out[j++] = (byte) ((0xF << 6) | (c >>> 6)); 1036 out[j++] = (byte) (0x80 | (0x3F & c)); 1037 } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) { 1038 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1039 out[j++] = (byte) ((0xF << 5) | (c >>> 12)); 1040 out[j++] = (byte) (0x80 | (0x3F & (c >>> 6))); 1041 out[j++] = (byte) (0x80 | (0x3F & c)); 1042 } else if (j <= limit - 4) { 1043 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, 1044 // four UTF-8 bytes 1045 final char low; 1046 if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { 1047 throw new UnpairedSurrogateException((i - 1), utf16Length); 1048 } 1049 int codePoint = Character.toCodePoint(c, low); 1050 out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18)); 1051 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 1052 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 1053 out[j++] = (byte) (0x80 | (0x3F & codePoint)); 1054 } else { 1055 // If we are surrogates and we're not a surrogate pair, always throw an 1056 // UnpairedSurrogateException instead of an ArrayOutOfBoundsException. 1057 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) 1058 && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { 1059 throw new UnpairedSurrogateException(i, utf16Length); 1060 } 1061 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); 1062 } 1063 } 1064 return j; 1065 } 1066 1067 @Override encodeUtf8Direct(String in, ByteBuffer out)1068 void encodeUtf8Direct(String in, ByteBuffer out) { 1069 // For safe processing, we have to use the ByteBuffer API. 1070 encodeUtf8Default(in, out); 1071 } 1072 partialIsValidUtf8(byte[] bytes, int index, int limit)1073 private static int partialIsValidUtf8(byte[] bytes, int index, int limit) { 1074 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 1075 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1076 while (index < limit && bytes[index] >= 0) { 1077 index++; 1078 } 1079 1080 return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit); 1081 } 1082 partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)1083 private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) { 1084 for (; ; ) { 1085 int byte1; 1086 int byte2; 1087 1088 // Optimize for interior runs of ASCII bytes. 1089 do { 1090 if (index >= limit) { 1091 return COMPLETE; 1092 } 1093 } while ((byte1 = bytes[index++]) >= 0); 1094 1095 if (byte1 < (byte) 0xE0) { 1096 // two-byte form 1097 1098 if (index >= limit) { 1099 // Incomplete sequence 1100 return byte1; 1101 } 1102 1103 // Simultaneously checks for illegal trailing-byte in 1104 // leading position and overlong 2-byte form. 1105 if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { 1106 return MALFORMED; 1107 } 1108 } else if (byte1 < (byte) 0xF0) { 1109 // three-byte form 1110 1111 if (index >= limit - 1) { // incomplete sequence 1112 return incompleteStateFor(bytes, index, limit); 1113 } 1114 if ((byte2 = bytes[index++]) > (byte) 0xBF 1115 // overlong? 5 most significant bits must not all be zero 1116 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1117 // check for illegal surrogate codepoints 1118 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1119 // byte3 trailing-byte test 1120 || bytes[index++] > (byte) 0xBF) { 1121 return MALFORMED; 1122 } 1123 } else { 1124 // four-byte form 1125 1126 if (index >= limit - 2) { // incomplete sequence 1127 return incompleteStateFor(bytes, index, limit); 1128 } 1129 if ((byte2 = bytes[index++]) > (byte) 0xBF 1130 // Check that 1 <= plane <= 16. Tricky optimized form of: 1131 // if (byte1 > (byte) 0xF4 || 1132 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1133 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1134 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1135 // byte3 trailing-byte test 1136 || bytes[index++] > (byte) 0xBF 1137 // byte4 trailing-byte test 1138 || bytes[index++] > (byte) 0xBF) { 1139 return MALFORMED; 1140 } 1141 } 1142 } 1143 } 1144 } 1145 1146 /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */ 1147 static final class UnsafeProcessor extends Processor { 1148 /** Indicates whether or not all required unsafe operations are supported on this platform. */ isAvailable()1149 static boolean isAvailable() { 1150 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); 1151 } 1152 1153 @Override partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)1154 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) { 1155 // Bitwise OR combines the sign bits so any negative value fails the check. 1156 if ((index | limit | bytes.length - limit) < 0) { 1157 throw new ArrayIndexOutOfBoundsException( 1158 String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit)); 1159 } 1160 long offset = index; 1161 final long offsetLimit = limit; 1162 if (state != COMPLETE) { 1163 // The previous decoding operation was incomplete (or malformed). 1164 // We look for a well-formed sequence consisting of bytes from 1165 // the previous decoding operation (stored in state) together 1166 // with bytes from the array slice. 1167 // 1168 // We expect such "straddler characters" to be rare. 1169 1170 if (offset >= offsetLimit) { // No bytes? No progress. 1171 return state; 1172 } 1173 int byte1 = (byte) state; 1174 // byte1 is never ASCII. 1175 if (byte1 < (byte) 0xE0) { 1176 // two-byte form 1177 1178 // Simultaneously checks for illegal trailing-byte in 1179 // leading position and overlong 2-byte form. 1180 if (byte1 < (byte) 0xC2 1181 // byte2 trailing-byte test 1182 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1183 return MALFORMED; 1184 } 1185 } else if (byte1 < (byte) 0xF0) { 1186 // three-byte form 1187 1188 // Get byte2 from saved state or array 1189 int byte2 = (byte) ~(state >> 8); 1190 if (byte2 == 0) { 1191 byte2 = UnsafeUtil.getByte(bytes, offset++); 1192 if (offset >= offsetLimit) { 1193 return incompleteStateFor(byte1, byte2); 1194 } 1195 } 1196 if (byte2 > (byte) 0xBF 1197 // overlong? 5 most significant bits must not all be zero 1198 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1199 // illegal surrogate codepoint? 1200 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1201 // byte3 trailing-byte test 1202 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1203 return MALFORMED; 1204 } 1205 } else { 1206 // four-byte form 1207 1208 // Get byte2 and byte3 from saved state or array 1209 int byte2 = (byte) ~(state >> 8); 1210 int byte3 = 0; 1211 if (byte2 == 0) { 1212 byte2 = UnsafeUtil.getByte(bytes, offset++); 1213 if (offset >= offsetLimit) { 1214 return incompleteStateFor(byte1, byte2); 1215 } 1216 } else { 1217 byte3 = (byte) (state >> 16); 1218 } 1219 if (byte3 == 0) { 1220 byte3 = UnsafeUtil.getByte(bytes, offset++); 1221 if (offset >= offsetLimit) { 1222 return incompleteStateFor(byte1, byte2, byte3); 1223 } 1224 } 1225 1226 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1227 // which never occurs in well-formed UTF-8, and so we will return 1228 // MALFORMED again below. 1229 1230 if (byte2 > (byte) 0xBF 1231 // Check that 1 <= plane <= 16. Tricky optimized form of: 1232 // if (byte1 > (byte) 0xF4 || 1233 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1234 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1235 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1236 // byte3 trailing-byte test 1237 || byte3 > (byte) 0xBF 1238 // byte4 trailing-byte test 1239 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1240 return MALFORMED; 1241 } 1242 } 1243 } 1244 1245 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1246 } 1247 1248 @Override partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1249 int partialIsValidUtf8Direct( 1250 final int state, ByteBuffer buffer, final int index, final int limit) { 1251 // Bitwise OR combines the sign bits so any negative value fails the check. 1252 if ((index | limit | buffer.limit() - limit) < 0) { 1253 throw new ArrayIndexOutOfBoundsException( 1254 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit)); 1255 } 1256 long address = addressOffset(buffer) + index; 1257 final long addressLimit = address + (limit - index); 1258 if (state != COMPLETE) { 1259 // The previous decoding operation was incomplete (or malformed). 1260 // We look for a well-formed sequence consisting of bytes from 1261 // the previous decoding operation (stored in state) together 1262 // with bytes from the array slice. 1263 // 1264 // We expect such "straddler characters" to be rare. 1265 1266 if (address >= addressLimit) { // No bytes? No progress. 1267 return state; 1268 } 1269 1270 final int byte1 = (byte) state; 1271 // byte1 is never ASCII. 1272 if (byte1 < (byte) 0xE0) { 1273 // two-byte form 1274 1275 // Simultaneously checks for illegal trailing-byte in 1276 // leading position and overlong 2-byte form. 1277 if (byte1 < (byte) 0xC2 1278 // byte2 trailing-byte test 1279 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1280 return MALFORMED; 1281 } 1282 } else if (byte1 < (byte) 0xF0) { 1283 // three-byte form 1284 1285 // Get byte2 from saved state or array 1286 int byte2 = (byte) ~(state >> 8); 1287 if (byte2 == 0) { 1288 byte2 = UnsafeUtil.getByte(address++); 1289 if (address >= addressLimit) { 1290 return incompleteStateFor(byte1, byte2); 1291 } 1292 } 1293 if (byte2 > (byte) 0xBF 1294 // overlong? 5 most significant bits must not all be zero 1295 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1296 // illegal surrogate codepoint? 1297 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1298 // byte3 trailing-byte test 1299 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1300 return MALFORMED; 1301 } 1302 } else { 1303 // four-byte form 1304 1305 // Get byte2 and byte3 from saved state or array 1306 int byte2 = (byte) ~(state >> 8); 1307 int byte3 = 0; 1308 if (byte2 == 0) { 1309 byte2 = UnsafeUtil.getByte(address++); 1310 if (address >= addressLimit) { 1311 return incompleteStateFor(byte1, byte2); 1312 } 1313 } else { 1314 byte3 = (byte) (state >> 16); 1315 } 1316 if (byte3 == 0) { 1317 byte3 = UnsafeUtil.getByte(address++); 1318 if (address >= addressLimit) { 1319 return incompleteStateFor(byte1, byte2, byte3); 1320 } 1321 } 1322 1323 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1324 // which never occurs in well-formed UTF-8, and so we will return 1325 // MALFORMED again below. 1326 1327 if (byte2 > (byte) 0xBF 1328 // Check that 1 <= plane <= 16. Tricky optimized form of: 1329 // if (byte1 > (byte) 0xF4 || 1330 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1331 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1332 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1333 // byte3 trailing-byte test 1334 || byte3 > (byte) 0xBF 1335 // byte4 trailing-byte test 1336 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1337 return MALFORMED; 1338 } 1339 } 1340 } 1341 1342 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1343 } 1344 1345 @Override decodeUtf8(byte[] bytes, int index, int size)1346 String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException { 1347 String s = new String(bytes, index, size, Internal.UTF_8); 1348 1349 // '\uFFFD' is the UTF-8 default replacement char, which illegal byte sequences get replaced 1350 // with. 1351 if (s.indexOf('\uFFFD') < 0) { 1352 return s; 1353 } 1354 1355 // Since s contains '\uFFFD' there are 2 options: 1356 // 1) The byte array slice is invalid UTF-8. 1357 // 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD". 1358 // To rule out (1), we encode s and compare it to the byte array slice. 1359 // If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes. 1360 if (Arrays.equals( 1361 s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) { 1362 return s; 1363 } 1364 1365 throw InvalidProtocolBufferException.invalidUtf8(); 1366 } 1367 1368 @Override decodeUtf8Direct(ByteBuffer buffer, int index, int size)1369 String decodeUtf8Direct(ByteBuffer buffer, int index, int size) 1370 throws InvalidProtocolBufferException { 1371 // Bitwise OR combines the sign bits so any negative value fails the check. 1372 if ((index | size | buffer.limit() - index - size) < 0) { 1373 throw new ArrayIndexOutOfBoundsException( 1374 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size)); 1375 } 1376 long address = UnsafeUtil.addressOffset(buffer) + index; 1377 final long addressLimit = address + size; 1378 1379 // The longest possible resulting String is the same as the number of input bytes, when it is 1380 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 1381 char[] resultArr = new char[size]; 1382 int resultPos = 0; 1383 1384 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 1385 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1386 while (address < addressLimit) { 1387 byte b = UnsafeUtil.getByte(address); 1388 if (!DecodeUtil.isOneByte(b)) { 1389 break; 1390 } 1391 address++; 1392 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1393 } 1394 1395 while (address < addressLimit) { 1396 byte byte1 = UnsafeUtil.getByte(address++); 1397 if (DecodeUtil.isOneByte(byte1)) { 1398 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 1399 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 1400 // extra optimized loop to take care of these runs. 1401 while (address < addressLimit) { 1402 byte b = UnsafeUtil.getByte(address); 1403 if (!DecodeUtil.isOneByte(b)) { 1404 break; 1405 } 1406 address++; 1407 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 1408 } 1409 } else if (DecodeUtil.isTwoBytes(byte1)) { 1410 if (address >= addressLimit) { 1411 throw InvalidProtocolBufferException.invalidUtf8(); 1412 } 1413 DecodeUtil.handleTwoBytes( 1414 byte1, /* byte2 */ UnsafeUtil.getByte(address++), resultArr, resultPos++); 1415 } else if (DecodeUtil.isThreeBytes(byte1)) { 1416 if (address >= addressLimit - 1) { 1417 throw InvalidProtocolBufferException.invalidUtf8(); 1418 } 1419 DecodeUtil.handleThreeBytes( 1420 byte1, 1421 /* byte2 */ UnsafeUtil.getByte(address++), 1422 /* byte3 */ UnsafeUtil.getByte(address++), 1423 resultArr, 1424 resultPos++); 1425 } else { 1426 if (address >= addressLimit - 2) { 1427 throw InvalidProtocolBufferException.invalidUtf8(); 1428 } 1429 DecodeUtil.handleFourBytes( 1430 byte1, 1431 /* byte2 */ UnsafeUtil.getByte(address++), 1432 /* byte3 */ UnsafeUtil.getByte(address++), 1433 /* byte4 */ UnsafeUtil.getByte(address++), 1434 resultArr, 1435 resultPos++); 1436 // 4-byte case requires two chars. 1437 resultPos++; 1438 } 1439 } 1440 1441 return new String(resultArr, 0, resultPos); 1442 } 1443 1444 @Override encodeUtf8(final String in, final byte[] out, final int offset, final int length)1445 int encodeUtf8(final String in, final byte[] out, final int offset, final int length) { 1446 long outIx = offset; 1447 final long outLimit = outIx + length; 1448 final int inLimit = in.length(); 1449 if (inLimit > length || out.length - length < offset) { 1450 // Not even enough room for an ASCII-encoded string. 1451 throw new ArrayIndexOutOfBoundsException( 1452 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1453 } 1454 1455 // Designed to take advantage of 1456 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1457 int inIx = 0; 1458 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1459 UnsafeUtil.putByte(out, outIx++, (byte) c); 1460 } 1461 if (inIx == inLimit) { 1462 // We're done, it was ASCII encoded. 1463 return (int) outIx; 1464 } 1465 1466 for (char c; inIx < inLimit; ++inIx) { 1467 c = in.charAt(inIx); 1468 if (c < 0x80 && outIx < outLimit) { 1469 UnsafeUtil.putByte(out, outIx++, (byte) c); 1470 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1471 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1472 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1473 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1474 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1475 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1476 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1477 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1478 } else if (outIx <= outLimit - 4L) { 1479 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1480 // bytes 1481 final char low; 1482 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1483 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1484 } 1485 int codePoint = toCodePoint(c, low); 1486 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1487 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1488 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1489 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1490 } else { 1491 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1492 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1493 // We are surrogates and we're not a surrogate pair. 1494 throw new UnpairedSurrogateException(inIx, inLimit); 1495 } 1496 // Not enough space in the output buffer. 1497 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1498 } 1499 } 1500 1501 // All bytes have been encoded. 1502 return (int) outIx; 1503 } 1504 1505 @Override encodeUtf8Direct(String in, ByteBuffer out)1506 void encodeUtf8Direct(String in, ByteBuffer out) { 1507 final long address = addressOffset(out); 1508 long outIx = address + out.position(); 1509 final long outLimit = address + out.limit(); 1510 final int inLimit = in.length(); 1511 if (inLimit > outLimit - outIx) { 1512 // Not even enough room for an ASCII-encoded string. 1513 throw new ArrayIndexOutOfBoundsException( 1514 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit()); 1515 } 1516 1517 // Designed to take advantage of 1518 // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination 1519 int inIx = 0; 1520 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1521 UnsafeUtil.putByte(outIx++, (byte) c); 1522 } 1523 if (inIx == inLimit) { 1524 // We're done, it was ASCII encoded. 1525 Java8Compatibility.position(out, (int) (outIx - address)); 1526 return; 1527 } 1528 1529 for (char c; inIx < inLimit; ++inIx) { 1530 c = in.charAt(inIx); 1531 if (c < 0x80 && outIx < outLimit) { 1532 UnsafeUtil.putByte(outIx++, (byte) c); 1533 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1534 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1535 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1536 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1537 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1538 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1539 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1540 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1541 } else if (outIx <= outLimit - 4L) { 1542 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1543 // bytes 1544 final char low; 1545 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1546 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1547 } 1548 int codePoint = toCodePoint(c, low); 1549 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1550 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1551 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1552 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1553 } else { 1554 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1555 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1556 // We are surrogates and we're not a surrogate pair. 1557 throw new UnpairedSurrogateException(inIx, inLimit); 1558 } 1559 // Not enough space in the output buffer. 1560 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1561 } 1562 } 1563 1564 // All bytes have been encoded. 1565 Java8Compatibility.position(out, (int) (outIx - address)); 1566 } 1567 1568 /** 1569 * Counts (approximately) the number of consecutive ASCII characters starting from the given 1570 * position, using the most efficient method available to the platform. 1571 * 1572 * @param bytes the array containing the character sequence 1573 * @param offset the offset position of the index (same as index + arrayBaseOffset) 1574 * @param maxChars the maximum number of characters to count 1575 * @return the number of ASCII characters found. The stopping position will be at or before the 1576 * first non-ASCII byte. 1577 */ unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1578 private static int unsafeEstimateConsecutiveAscii( 1579 byte[] bytes, long offset, final int maxChars) { 1580 if (maxChars < UNSAFE_COUNT_ASCII_THRESHOLD) { 1581 // Don't bother with small strings. 1582 return 0; 1583 } 1584 1585 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1586 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that 1587 // the index (relative to the start of the array) is also 8-byte aligned. We do this by 1588 // ANDing the index with 7 to determine the number of bytes that need to be read before 1589 // we're 8-byte aligned. 1590 final int unaligned = 8 - ((int) offset & 7); 1591 int i; 1592 for (i = 0; i < unaligned; i++) { 1593 if (UnsafeUtil.getByte(bytes, offset++) < 0) { 1594 return i; 1595 } 1596 } 1597 1598 for (; i + 8 <= maxChars; i += 8) { 1599 if ((UnsafeUtil.getLong(bytes, UnsafeUtil.BYTE_ARRAY_BASE_OFFSET + offset) 1600 & ASCII_MASK_LONG) 1601 != 0L) { 1602 break; 1603 } 1604 offset += 8; 1605 } 1606 1607 for (; i < maxChars; i++) { 1608 if (UnsafeUtil.getByte(bytes, offset++) < 0) { 1609 return i; 1610 } 1611 } 1612 return maxChars; 1613 } 1614 1615 /** 1616 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1617 * most efficient method available to the platform. 1618 */ unsafeEstimateConsecutiveAscii(long address, final int maxChars)1619 private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) { 1620 int remaining = maxChars; 1621 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1622 // Don't bother with small strings. 1623 return 0; 1624 } 1625 1626 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1627 // This is equivalent to (8-address) mod 8, the number of bytes we need to read before we're 1628 // 8-byte aligned. 1629 final int unaligned = (int) (-address & 7); 1630 for (int j = unaligned; j > 0; j--) { 1631 if (UnsafeUtil.getByte(address++) < 0) { 1632 return unaligned - j; 1633 } 1634 } 1635 1636 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1637 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1638 // determine if any byte in the current long is non-ASCII. 1639 remaining -= unaligned; 1640 for (; 1641 remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0; 1642 address += 8, remaining -= 8) {} 1643 return maxChars - remaining; 1644 } 1645 partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1646 private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) { 1647 // Skip past ASCII characters as quickly as possible. 1648 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining); 1649 remaining -= skipped; 1650 offset += skipped; 1651 1652 for (; ; ) { 1653 // Optimize for interior runs of ASCII bytes. 1654 // TODO: Consider checking 8 bytes at a time after some threshold? 1655 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1656 int byte1 = 0; 1657 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {} 1658 if (remaining == 0) { 1659 return COMPLETE; 1660 } 1661 remaining--; 1662 1663 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1664 if (byte1 < (byte) 0xE0) { 1665 // Two-byte form (110xxxxx 10xxxxxx) 1666 if (remaining == 0) { 1667 // Incomplete sequence 1668 return byte1; 1669 } 1670 remaining--; 1671 1672 // Simultaneously checks for illegal trailing-byte in 1673 // leading position and overlong 2-byte form. 1674 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1675 return MALFORMED; 1676 } 1677 } else if (byte1 < (byte) 0xF0) { 1678 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1679 if (remaining < 2) { 1680 // Incomplete sequence 1681 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1682 } 1683 remaining -= 2; 1684 1685 final int byte2; 1686 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1687 // overlong? 5 most significant bits must not all be zero 1688 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1689 // check for illegal surrogate codepoints 1690 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1691 // byte3 trailing-byte test 1692 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1693 return MALFORMED; 1694 } 1695 } else { 1696 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 1697 if (remaining < 3) { 1698 // Incomplete sequence 1699 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1700 } 1701 remaining -= 3; 1702 1703 final int byte2; 1704 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1705 // Check that 1 <= plane <= 16. Tricky optimized form of: 1706 // if (byte1 > (byte) 0xF4 || 1707 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1708 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1709 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1710 // byte3 trailing-byte test 1711 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF 1712 // byte4 trailing-byte test 1713 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1714 return MALFORMED; 1715 } 1716 } 1717 } 1718 } 1719 partialIsValidUtf8(long address, int remaining)1720 private static int partialIsValidUtf8(long address, int remaining) { 1721 // Skip past ASCII characters as quickly as possible. 1722 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1723 address += skipped; 1724 remaining -= skipped; 1725 1726 for (; ; ) { 1727 // Optimize for interior runs of ASCII bytes. 1728 // TODO: Consider checking 8 bytes at a time after some threshold? 1729 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1730 int byte1 = 0; 1731 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {} 1732 if (remaining == 0) { 1733 return COMPLETE; 1734 } 1735 remaining--; 1736 1737 if (byte1 < (byte) 0xE0) { 1738 // Two-byte form 1739 1740 if (remaining == 0) { 1741 // Incomplete sequence 1742 return byte1; 1743 } 1744 remaining--; 1745 1746 // Simultaneously checks for illegal trailing-byte in 1747 // leading position and overlong 2-byte form. 1748 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1749 return MALFORMED; 1750 } 1751 } else if (byte1 < (byte) 0xF0) { 1752 // Three-byte form 1753 1754 if (remaining < 2) { 1755 // Incomplete sequence 1756 return unsafeIncompleteStateFor(address, byte1, remaining); 1757 } 1758 remaining -= 2; 1759 1760 final byte byte2 = UnsafeUtil.getByte(address++); 1761 if (byte2 > (byte) 0xBF 1762 // overlong? 5 most significant bits must not all be zero 1763 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1764 // check for illegal surrogate codepoints 1765 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1766 // byte3 trailing-byte test 1767 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1768 return MALFORMED; 1769 } 1770 } else { 1771 // Four-byte form 1772 1773 if (remaining < 3) { 1774 // Incomplete sequence 1775 return unsafeIncompleteStateFor(address, byte1, remaining); 1776 } 1777 remaining -= 3; 1778 1779 final byte byte2 = UnsafeUtil.getByte(address++); 1780 if (byte2 > (byte) 0xBF 1781 // Check that 1 <= plane <= 16. Tricky optimized form of: 1782 // if (byte1 > (byte) 0xF4 || 1783 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1784 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1785 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1786 // byte3 trailing-byte test 1787 || UnsafeUtil.getByte(address++) > (byte) 0xBF 1788 // byte4 trailing-byte test 1789 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1790 return MALFORMED; 1791 } 1792 } 1793 } 1794 } 1795 unsafeIncompleteStateFor( byte[] bytes, int byte1, long offset, int remaining)1796 private static int unsafeIncompleteStateFor( 1797 byte[] bytes, int byte1, long offset, int remaining) { 1798 switch (remaining) { 1799 case 0: 1800 return incompleteStateFor(byte1); 1801 case 1: 1802 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); 1803 case 2: 1804 return incompleteStateFor( 1805 byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1)); 1806 default: 1807 throw new AssertionError(); 1808 } 1809 } 1810 unsafeIncompleteStateFor(long address, final int byte1, int remaining)1811 private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) { 1812 switch (remaining) { 1813 case 0: 1814 return incompleteStateFor(byte1); 1815 case 1: 1816 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); 1817 case 2: 1818 return incompleteStateFor( 1819 byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1)); 1820 default: 1821 throw new AssertionError(); 1822 } 1823 } 1824 } 1825 1826 /** 1827 * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting 1828 * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity 1829 * checks and codepoint conversion happen in this class. 1830 */ 1831 private static class DecodeUtil { 1832 1833 /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */ isOneByte(byte b)1834 private static boolean isOneByte(byte b) { 1835 return b >= 0; 1836 } 1837 1838 /** 1839 * Returns whether this is a two-byte codepoint with the form '10XXXXXX' iff 1840 * {@link #isOneByte(byte)} is false. This private method works in the limited use in 1841 * this class where this method is only called when {@link #isOneByte(byte)} has already 1842 * returned false. It is not suitable for general or public use. 1843 */ isTwoBytes(byte b)1844 private static boolean isTwoBytes(byte b) { 1845 return b < (byte) 0xE0; 1846 } 1847 1848 /** 1849 * Returns whether this is a three-byte codepoint with the form '110XXXXX' iff 1850 * {@link #isOneByte(byte)} and {@link #isTwoBytes(byte)} are false. 1851 * This private method works in the limited use in 1852 * this class where this method is only called when {@link #isOneByte(byte)} an 1853 * {@link #isTwoBytes(byte)} have already returned false. It is not suitable for general 1854 * or public use. 1855 */ isThreeBytes(byte b)1856 private static boolean isThreeBytes(byte b) { 1857 return b < (byte) 0xF0; 1858 } 1859 handleOneByte(byte byte1, char[] resultArr, int resultPos)1860 private static void handleOneByte(byte byte1, char[] resultArr, int resultPos) { 1861 resultArr[resultPos] = (char) byte1; 1862 } 1863 handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)1864 private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos) 1865 throws InvalidProtocolBufferException { 1866 // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and 1867 // overlong 2-byte, '11000001'. 1868 if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) { 1869 throw InvalidProtocolBufferException.invalidUtf8(); 1870 } 1871 resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); 1872 } 1873 handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)1874 private static void handleThreeBytes( 1875 byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos) 1876 throws InvalidProtocolBufferException { 1877 if (isNotTrailingByte(byte2) 1878 // overlong? 5 most significant bits must not all be zero 1879 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1880 // check for illegal surrogate codepoints 1881 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1882 || isNotTrailingByte(byte3)) { 1883 throw InvalidProtocolBufferException.invalidUtf8(); 1884 } 1885 resultArr[resultPos] = 1886 (char) 1887 (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); 1888 } 1889 handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)1890 private static void handleFourBytes( 1891 byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos) 1892 throws InvalidProtocolBufferException { 1893 if (isNotTrailingByte(byte2) 1894 // Check that 1 <= plane <= 16. Tricky optimized form of: 1895 // valid 4-byte leading byte? 1896 // if (byte1 > (byte) 0xF4 || 1897 // overlong? 4 most significant bits must not all be zero 1898 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1899 // codepoint larger than the highest code point (U+10FFFF)? 1900 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1901 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1902 || isNotTrailingByte(byte3) 1903 || isNotTrailingByte(byte4)) { 1904 throw InvalidProtocolBufferException.invalidUtf8(); 1905 } 1906 int codepoint = 1907 ((byte1 & 0x07) << 18) 1908 | (trailingByteValue(byte2) << 12) 1909 | (trailingByteValue(byte3) << 6) 1910 | trailingByteValue(byte4); 1911 resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint); 1912 resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint); 1913 } 1914 1915 /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */ isNotTrailingByte(byte b)1916 private static boolean isNotTrailingByte(byte b) { 1917 return b > (byte) 0xBF; 1918 } 1919 1920 /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */ trailingByteValue(byte b)1921 private static int trailingByteValue(byte b) { 1922 return b & 0x3F; 1923 } 1924 highSurrogate(int codePoint)1925 private static char highSurrogate(int codePoint) { 1926 return (char) 1927 ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10)); 1928 } 1929 lowSurrogate(int codePoint)1930 private static char lowSurrogate(int codePoint) { 1931 return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff)); 1932 } 1933 } 1934 Utf8()1935 private Utf8() {} 1936 } 1937