1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.protobuf; 32 33 import static com.google.protobuf.UnsafeUtil.addressOffset; 34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset; 35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations; 36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations; 37 import static java.lang.Character.MAX_SURROGATE; 38 import static java.lang.Character.MIN_SURROGATE; 39 import static java.lang.Character.isSurrogatePair; 40 import static java.lang.Character.toCodePoint; 41 42 import java.nio.ByteBuffer; 43 44 /** 45 * A set of low-level, high-performance static utility methods related 46 * to the UTF-8 character encoding. This class has no dependencies 47 * outside of the core JDK libraries. 48 * 49 * <p>There are several variants of UTF-8. The one implemented by 50 * this class is the restricted definition of UTF-8 introduced in 51 * Unicode 3.1, which mandates the rejection of "overlong" byte 52 * sequences as well as rejection of 3-byte surrogate codepoint byte 53 * sequences. Note that the UTF-8 decoder included in Oracle's JDK 54 * has been modified to also reject "overlong" byte sequences, but (as 55 * of 2011) still accepts 3-byte surrogate codepoint byte sequences. 56 * 57 * <p>The byte sequences considered valid by this class are exactly 58 * those that can be roundtrip converted to Strings and back to bytes 59 * using the UTF-8 charset, without loss: <pre> {@code 60 * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) 61 * }</pre> 62 * 63 * <p>See the Unicode Standard,</br> 64 * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> 65 * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>. 66 * 67 * <p>This class supports decoding of partial byte sequences, so that the 68 * bytes in a complete UTF-8 byte sequences can be stored in multiple 69 * segments. Methods typically return {@link #MALFORMED} if the partial 70 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is 71 * well-formed in the absence of additional input, or if the byte sequence 72 * apparently terminated in the middle of a character, an opaque integer 73 * "state" value containing enough information to decode the character when 74 * passed to a subsequent invocation of a partial decoding method. 75 * 76 * @author martinrb@google.com (Martin Buchholz) 77 */ 78 // TODO(nathanmittler): Copy changes in this class back to Guava 79 final class Utf8 { 80 81 /** 82 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 83 * depending on what is available on the platform. The processor is the platform-optimized 84 * delegate for which all methods are delegated directly to. 85 */ 86 private static final Processor processor = 87 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(); 88 89 /** 90 * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII 91 * characters (i.e. any byte >= 0x80). 92 */ 93 private static final long ASCII_MASK_LONG = 0x8080808080808080L; 94 95 /** 96 * Maximum number of bytes per Java UTF-16 char in UTF-8. 97 * @see java.nio.charset.CharsetEncoder#maxBytesPerChar() 98 */ 99 static final int MAX_BYTES_PER_CHAR = 3; 100 101 /** 102 * State value indicating that the byte sequence is well-formed and 103 * complete (no further bytes are needed to complete a character). 104 */ 105 public static final int COMPLETE = 0; 106 107 /** 108 * State value indicating that the byte sequence is definitely not 109 * well-formed. 110 */ 111 public static final int MALFORMED = -1; 112 113 /** 114 * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length 115 * above which to employ an optimized algorithm for counting ASCII characters. The reason for this 116 * threshold is that for small strings, the optimization may not be beneficial or may even 117 * negatively impact performance since it requires additional logic to avoid unaligned reads 118 * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial 119 * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} 120 * which provides a performance improvement that entirely subsumes the cost of the additional 121 * logic. 122 */ 123 private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16; 124 125 // Other state values include the partial bytes of the incomplete 126 // character to be decoded in the simplest way: we pack the bytes 127 // into the state int in little-endian order. For example: 128 // 129 // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16); 130 // 131 // Such a state is unpacked thus (note the ~ operation for byte2 to 132 // undo byte1's sign-extension bits): 133 // 134 // int byte1 = (byte) state; 135 // int byte2 = (byte) ~(state >> 8); 136 // int byte3 = (byte) (state >> 16); 137 // 138 // We cannot store a zero byte in the state because it would be 139 // indistinguishable from the absence of a byte. But we don't need 140 // to, because partial bytes must always be negative. When building 141 // a state, we ensure that byte1 is negative and subsequent bytes 142 // are valid trailing bytes. 143 144 /** 145 * Returns {@code true} if the given byte array is a well-formed 146 * UTF-8 byte sequence. 147 * 148 * <p>This is a convenience method, equivalent to a call to {@code 149 * isValidUtf8(bytes, 0, bytes.length)}. 150 */ isValidUtf8(byte[] bytes)151 public static boolean isValidUtf8(byte[] bytes) { 152 return processor.isValidUtf8(bytes, 0, bytes.length); 153 } 154 155 /** 156 * Returns {@code true} if the given byte array slice is a 157 * well-formed UTF-8 byte sequence. The range of bytes to be 158 * checked extends from index {@code index}, inclusive, to {@code 159 * limit}, exclusive. 160 * 161 * <p>This is a convenience method, equivalent to {@code 162 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 163 */ isValidUtf8(byte[] bytes, int index, int limit)164 public static boolean isValidUtf8(byte[] bytes, int index, int limit) { 165 return processor.isValidUtf8(bytes, index, limit); 166 } 167 168 /** 169 * Tells whether the given byte array slice is a well-formed, 170 * malformed, or incomplete UTF-8 byte sequence. The range of bytes 171 * to be checked extends from index {@code index}, inclusive, to 172 * {@code limit}, exclusive. 173 * 174 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding 175 * operation) or the value returned from a call to a partial decoding method 176 * for the previous bytes 177 * 178 * @return {@link #MALFORMED} if the partial byte sequence is 179 * definitely not well-formed, {@link #COMPLETE} if it is well-formed 180 * (no additional input needed), or if the byte sequence is 181 * "incomplete", i.e. apparently terminated in the middle of a character, 182 * an opaque integer "state" value containing enough information to 183 * decode the character when passed to a subsequent invocation of a 184 * partial decoding method. 185 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)186 public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 187 return processor.partialIsValidUtf8(state, bytes, index, limit); 188 } 189 incompleteStateFor(int byte1)190 private static int incompleteStateFor(int byte1) { 191 return (byte1 > (byte) 0xF4) ? 192 MALFORMED : byte1; 193 } 194 incompleteStateFor(int byte1, int byte2)195 private static int incompleteStateFor(int byte1, int byte2) { 196 return (byte1 > (byte) 0xF4 || 197 byte2 > (byte) 0xBF) ? 198 MALFORMED : byte1 ^ (byte2 << 8); 199 } 200 incompleteStateFor(int byte1, int byte2, int byte3)201 private static int incompleteStateFor(int byte1, int byte2, int byte3) { 202 return (byte1 > (byte) 0xF4 || 203 byte2 > (byte) 0xBF || 204 byte3 > (byte) 0xBF) ? 205 MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16); 206 } 207 incompleteStateFor(byte[] bytes, int index, int limit)208 private static int incompleteStateFor(byte[] bytes, int index, int limit) { 209 int byte1 = bytes[index - 1]; 210 switch (limit - index) { 211 case 0: return incompleteStateFor(byte1); 212 case 1: return incompleteStateFor(byte1, bytes[index]); 213 case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); 214 default: throw new AssertionError(); 215 } 216 } 217 incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)218 private static int incompleteStateFor( 219 final ByteBuffer buffer, final int byte1, final int index, final int remaining) { 220 switch (remaining) { 221 case 0: 222 return incompleteStateFor(byte1); 223 case 1: 224 return incompleteStateFor(byte1, buffer.get(index)); 225 case 2: 226 return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1)); 227 default: 228 throw new AssertionError(); 229 } 230 } 231 232 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 233 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 234 // fallback to more lenient behavior. 235 236 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)237 UnpairedSurrogateException(int index, int length) { 238 super("Unpaired surrogate at index " + index + " of " + length); 239 } 240 } 241 242 /** 243 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 244 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 245 * both time and space. 246 * 247 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 248 * surrogates) 249 */ encodedLength(CharSequence sequence)250 static int encodedLength(CharSequence sequence) { 251 // Warning to maintainers: this implementation is highly optimized. 252 int utf16Length = sequence.length(); 253 int utf8Length = utf16Length; 254 int i = 0; 255 256 // This loop optimizes for pure ASCII. 257 while (i < utf16Length && sequence.charAt(i) < 0x80) { 258 i++; 259 } 260 261 // This loop optimizes for chars less than 0x800. 262 for (; i < utf16Length; i++) { 263 char c = sequence.charAt(i); 264 if (c < 0x800) { 265 utf8Length += ((0x7f - c) >>> 31); // branch free! 266 } else { 267 utf8Length += encodedLengthGeneral(sequence, i); 268 break; 269 } 270 } 271 272 if (utf8Length < utf16Length) { 273 // Necessary and sufficient condition for overflow because of maximum 3x expansion 274 throw new IllegalArgumentException("UTF-8 length does not fit in int: " 275 + (utf8Length + (1L << 32))); 276 } 277 return utf8Length; 278 } 279 encodedLengthGeneral(CharSequence sequence, int start)280 private static int encodedLengthGeneral(CharSequence sequence, int start) { 281 int utf16Length = sequence.length(); 282 int utf8Length = 0; 283 for (int i = start; i < utf16Length; i++) { 284 char c = sequence.charAt(i); 285 if (c < 0x800) { 286 utf8Length += (0x7f - c) >>> 31; // branch free! 287 } else { 288 utf8Length += 2; 289 // jdk7+: if (Character.isSurrogate(c)) { 290 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 291 // Check that we have a well-formed surrogate pair. 292 int cp = Character.codePointAt(sequence, i); 293 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 294 throw new UnpairedSurrogateException(i, utf16Length); 295 } 296 i++; 297 } 298 } 299 } 300 return utf8Length; 301 } 302 encode(CharSequence in, byte[] out, int offset, int length)303 static int encode(CharSequence in, byte[] out, int offset, int length) { 304 return processor.encodeUtf8(in, out, offset, length); 305 } 306 // End Guava UTF-8 methods. 307 308 /** 309 * Determines if the given {@link ByteBuffer} is a valid UTF-8 string. 310 * 311 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 312 * and the capabilities of the platform. 313 * 314 * @param buffer the buffer to check. 315 * @see Utf8#isValidUtf8(byte[], int, int) 316 */ isValidUtf8(ByteBuffer buffer)317 static boolean isValidUtf8(ByteBuffer buffer) { 318 return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining()); 319 } 320 321 /** 322 * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string. 323 * 324 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 325 * and the capabilities of the platform. 326 * 327 * @param buffer the buffer to check. 328 * @see Utf8#partialIsValidUtf8(int, byte[], int, int) 329 */ partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)330 static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) { 331 return processor.partialIsValidUtf8(state, buffer, index, limit); 332 } 333 334 /** 335 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 336 * 337 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 338 * and the capabilities of the platform. 339 * 340 * @param in the source string to be encoded 341 * @param out the target buffer to receive the encoded string. 342 * @see Utf8#encode(CharSequence, byte[], int, int) 343 */ encodeUtf8(CharSequence in, ByteBuffer out)344 static void encodeUtf8(CharSequence in, ByteBuffer out) { 345 processor.encodeUtf8(in, out); 346 } 347 348 /** 349 * Counts (approximately) the number of consecutive ASCII characters in the given buffer. 350 * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if 351 * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). 352 * 353 * @param buffer the buffer to be scanned for ASCII chars 354 * @param index the starting index of the scan 355 * @param limit the limit within buffer for the scan 356 * @return the number of ASCII characters found. The stopping position will be at or 357 * before the first non-ASCII byte. 358 */ estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)359 private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) { 360 int i = index; 361 final int lim = limit - 7; 362 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 363 // To speed things up further, we're reading longs instead of bytes so we use a mask to 364 // determine if any byte in the current long is non-ASCII. 365 for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {} 366 return i - index; 367 } 368 369 /** 370 * A processor of UTF-8 strings, providing methods for checking validity and encoding. 371 */ 372 // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android. 373 abstract static class Processor { 374 /** 375 * Returns {@code true} if the given byte array slice is a 376 * well-formed UTF-8 byte sequence. The range of bytes to be 377 * checked extends from index {@code index}, inclusive, to {@code 378 * limit}, exclusive. 379 * 380 * <p>This is a convenience method, equivalent to {@code 381 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 382 */ isValidUtf8(byte[] bytes, int index, int limit)383 final boolean isValidUtf8(byte[] bytes, int index, int limit) { 384 return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE; 385 } 386 387 /** 388 * Tells whether the given byte array slice is a well-formed, 389 * malformed, or incomplete UTF-8 byte sequence. The range of bytes 390 * to be checked extends from index {@code index}, inclusive, to 391 * {@code limit}, exclusive. 392 * 393 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding 394 * operation) or the value returned from a call to a partial decoding method 395 * for the previous bytes 396 * 397 * @return {@link #MALFORMED} if the partial byte sequence is 398 * definitely not well-formed, {@link #COMPLETE} if it is well-formed 399 * (no additional input needed), or if the byte sequence is 400 * "incomplete", i.e. apparently terminated in the middle of a character, 401 * an opaque integer "state" value containing enough information to 402 * decode the character when passed to a subsequent invocation of a 403 * partial decoding method. 404 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)405 abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit); 406 407 /** 408 * Returns {@code true} if the given portion of the {@link ByteBuffer} is a 409 * well-formed UTF-8 byte sequence. The range of bytes to be 410 * checked extends from index {@code index}, inclusive, to {@code 411 * limit}, exclusive. 412 * 413 * <p>This is a convenience method, equivalent to {@code 414 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 415 */ isValidUtf8(ByteBuffer buffer, int index, int limit)416 final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) { 417 return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE; 418 } 419 420 /** 421 * Indicates whether or not the given buffer contains a valid UTF-8 string. 422 * 423 * @param buffer the buffer to check. 424 * @return {@code true} if the given buffer contains a valid UTF-8 string. 425 */ partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)426 final int partialIsValidUtf8( 427 final int state, final ByteBuffer buffer, int index, final int limit) { 428 if (buffer.hasArray()) { 429 final int offset = buffer.arrayOffset(); 430 return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit); 431 } else if (buffer.isDirect()){ 432 return partialIsValidUtf8Direct(state, buffer, index, limit); 433 } 434 return partialIsValidUtf8Default(state, buffer, index, limit); 435 } 436 437 /** 438 * Performs validation for direct {@link ByteBuffer} instances. 439 */ partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)440 abstract int partialIsValidUtf8Direct( 441 final int state, final ByteBuffer buffer, int index, final int limit); 442 443 /** 444 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 445 * than potentially faster approaches. This first completes validation for the current 446 * character (provided by {@code state}) and then finishes validation for the sequence. 447 */ partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)448 final int partialIsValidUtf8Default( 449 final int state, final ByteBuffer buffer, int index, final int limit) { 450 if (state != COMPLETE) { 451 // The previous decoding operation was incomplete (or malformed). 452 // We look for a well-formed sequence consisting of bytes from 453 // the previous decoding operation (stored in state) together 454 // with bytes from the array slice. 455 // 456 // We expect such "straddler characters" to be rare. 457 458 if (index >= limit) { // No bytes? No progress. 459 return state; 460 } 461 462 byte byte1 = (byte) state; 463 // byte1 is never ASCII. 464 if (byte1 < (byte) 0xE0) { 465 // two-byte form 466 467 // Simultaneously checks for illegal trailing-byte in 468 // leading position and overlong 2-byte form. 469 if (byte1 < (byte) 0xC2 470 // byte2 trailing-byte test 471 || buffer.get(index++) > (byte) 0xBF) { 472 return MALFORMED; 473 } 474 } else if (byte1 < (byte) 0xF0) { 475 // three-byte form 476 477 // Get byte2 from saved state or array 478 byte byte2 = (byte) ~(state >> 8); 479 if (byte2 == 0) { 480 byte2 = buffer.get(index++); 481 if (index >= limit) { 482 return incompleteStateFor(byte1, byte2); 483 } 484 } 485 if (byte2 > (byte) 0xBF 486 // overlong? 5 most significant bits must not all be zero 487 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 488 // illegal surrogate codepoint? 489 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 490 // byte3 trailing-byte test 491 || buffer.get(index++) > (byte) 0xBF) { 492 return MALFORMED; 493 } 494 } else { 495 // four-byte form 496 497 // Get byte2 and byte3 from saved state or array 498 byte byte2 = (byte) ~(state >> 8); 499 byte byte3 = 0; 500 if (byte2 == 0) { 501 byte2 = buffer.get(index++); 502 if (index >= limit) { 503 return incompleteStateFor(byte1, byte2); 504 } 505 } else { 506 byte3 = (byte) (state >> 16); 507 } 508 if (byte3 == 0) { 509 byte3 = buffer.get(index++); 510 if (index >= limit) { 511 return incompleteStateFor(byte1, byte2, byte3); 512 } 513 } 514 515 // If we were called with state == MALFORMED, then byte1 is 0xFF, 516 // which never occurs in well-formed UTF-8, and so we will return 517 // MALFORMED again below. 518 519 if (byte2 > (byte) 0xBF 520 // Check that 1 <= plane <= 16. Tricky optimized form of: 521 // if (byte1 > (byte) 0xF4 || 522 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 523 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 524 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 525 // byte3 trailing-byte test 526 || byte3 > (byte) 0xBF 527 // byte4 trailing-byte test 528 || buffer.get(index++) > (byte) 0xBF) { 529 return MALFORMED; 530 } 531 } 532 } 533 534 // Finish validation for the sequence. 535 return partialIsValidUtf8(buffer, index, limit); 536 } 537 538 /** 539 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 540 * than potentially faster approaches. 541 */ partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)542 private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) { 543 index += estimateConsecutiveAscii(buffer, index, limit); 544 545 for (;;) { 546 // Optimize for interior runs of ASCII bytes. 547 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 548 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 549 int byte1; 550 do { 551 if (index >= limit) { 552 return COMPLETE; 553 } 554 } while ((byte1 = buffer.get(index++)) >= 0); 555 556 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 557 if (byte1 < (byte) 0xE0) { 558 // Two-byte form (110xxxxx 10xxxxxx) 559 if (index >= limit) { 560 // Incomplete sequence 561 return byte1; 562 } 563 564 // Simultaneously checks for illegal trailing-byte in 565 // leading position and overlong 2-byte form. 566 if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) { 567 return MALFORMED; 568 } 569 index++; 570 } else if (byte1 < (byte) 0xF0) { 571 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 572 if (index >= limit - 1) { 573 // Incomplete sequence 574 return incompleteStateFor(buffer, byte1, index, limit - index); 575 } 576 577 final byte byte2 = buffer.get(index++); 578 if (byte2 > (byte) 0xBF 579 // overlong? 5 most significant bits must not all be zero 580 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 581 // check for illegal surrogate codepoints 582 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 583 // byte3 trailing-byte test 584 || buffer.get(index) > (byte) 0xBF) { 585 return MALFORMED; 586 } 587 index++; 588 } else { 589 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 590 if (index >= limit - 2) { 591 // Incomplete sequence 592 return incompleteStateFor(buffer, byte1, index, limit - index); 593 } 594 595 // TODO(nathanmittler): Consider using getInt() to improve performance. 596 final int byte2 = buffer.get(index++); 597 if (byte2 > (byte) 0xBF 598 // Check that 1 <= plane <= 16. Tricky optimized form of: 599 // if (byte1 > (byte) 0xF4 || 600 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 601 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 602 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 603 // byte3 trailing-byte test 604 || buffer.get(index++) > (byte) 0xBF 605 // byte4 trailing-byte test 606 || buffer.get(index++) > (byte) 0xBF) { 607 return MALFORMED; 608 } 609 } 610 } 611 } 612 613 /** 614 * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}). 615 * For a string, this method is similar to 616 * <pre>{@code 617 * byte[] a = string.getBytes(UTF_8); 618 * System.arraycopy(a, 0, bytes, offset, a.length); 619 * return offset + a.length; 620 * }</pre> 621 * 622 * but is more efficient in both time and space. One key difference is that this method 623 * requires paired surrogates, and therefore does not support chunking. 624 * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default 625 * replacement character, this method throws {@link UnpairedSurrogateException}. 626 * 627 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 628 * compute the exact amount needed, or leave room for 629 * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number 630 * of bytes that any input can be encoded to. 631 * 632 * @param in the input character sequence to be encoded 633 * @param out the target array 634 * @param offset the starting offset in {@code bytes} to start writing at 635 * @param length the length of the {@code bytes}, starting from {@code offset} 636 * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired 637 * surrogates) 638 * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than 639 * {@code bytes.length - offset} 640 * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)} 641 */ encodeUtf8(CharSequence in, byte[] out, int offset, int length)642 abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length); 643 644 /** 645 * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}). 646 * Upon returning from this method, the {@code out} position will point to the position after 647 * the last encoded byte. This method requires paired surrogates, and therefore does not 648 * support chunking. 649 * 650 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 651 * compute the exact amount needed, or leave room for 652 * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number 653 * of bytes that any input can be encoded to. 654 * 655 * @param in the source character sequence to be encoded 656 * @param out the target buffer 657 * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired 658 * surrogates) 659 * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than 660 * {@code out.remaining()} 661 */ encodeUtf8(CharSequence in, ByteBuffer out)662 final void encodeUtf8(CharSequence in, ByteBuffer out) { 663 if (out.hasArray()) { 664 final int offset = out.arrayOffset(); 665 int endIndex = 666 Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); 667 out.position(endIndex - offset); 668 } else if (out.isDirect()) { 669 encodeUtf8Direct(in, out); 670 } else { 671 encodeUtf8Default(in, out); 672 } 673 } 674 675 /** 676 * Encodes the input character sequence to a direct {@link ByteBuffer} instance. 677 */ encodeUtf8Direct(CharSequence in, ByteBuffer out)678 abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out); 679 680 /** 681 * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link 682 * ByteBuffer} API, rather than potentially faster approaches. 683 */ encodeUtf8Default(CharSequence in, ByteBuffer out)684 final void encodeUtf8Default(CharSequence in, ByteBuffer out) { 685 final int inLength = in.length(); 686 int outIx = out.position(); 687 int inIx = 0; 688 689 // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check 690 // access. Assume the buffer is big enough and let it handle the out of bounds exception 691 // if it occurs. 692 try { 693 // Designed to take advantage of 694 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 695 for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { 696 out.put(outIx + inIx, (byte) c); 697 } 698 if (inIx == inLength) { 699 // Successfully encoded the entire string. 700 out.position(outIx + inIx); 701 return; 702 } 703 704 outIx += inIx; 705 for (char c; inIx < inLength; ++inIx, ++outIx) { 706 c = in.charAt(inIx); 707 if (c < 0x80) { 708 // One byte (0xxx xxxx) 709 out.put(outIx, (byte) c); 710 } else if (c < 0x800) { 711 // Two bytes (110x xxxx 10xx xxxx) 712 713 // Benchmarks show put performs better than putShort here (for HotSpot). 714 out.put(outIx++, (byte) (0xC0 | (c >>> 6))); 715 out.put(outIx, (byte) (0x80 | (0x3F & c))); 716 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 717 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 718 // Maximum single-char code point is 0xFFFF, 16 bits. 719 720 // Benchmarks show put performs better than putShort here (for HotSpot). 721 out.put(outIx++, (byte) (0xE0 | (c >>> 12))); 722 out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 723 out.put(outIx, (byte) (0x80 | (0x3F & c))); 724 } else { 725 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 726 727 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 728 // bytes 729 final char low; 730 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 731 throw new UnpairedSurrogateException(inIx, inLength); 732 } 733 // TODO(nathanmittler): Consider using putInt() to improve performance. 734 int codePoint = toCodePoint(c, low); 735 out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 736 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 737 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 738 out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); 739 } 740 } 741 742 // Successfully encoded the entire string. 743 out.position(outIx); 744 } catch (IndexOutOfBoundsException e) { 745 // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead. 746 747 // If we failed in the outer ASCII loop, outIx will not have been updated. In this case, 748 // use inIx to determine the bad write index. 749 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1); 750 throw new ArrayIndexOutOfBoundsException( 751 "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex); 752 } 753 } 754 } 755 756 /** 757 * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. 758 */ 759 static final class SafeProcessor extends Processor { 760 @Override partialIsValidUtf8(int state, byte[] bytes, int index, int limit)761 int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 762 if (state != COMPLETE) { 763 // The previous decoding operation was incomplete (or malformed). 764 // We look for a well-formed sequence consisting of bytes from 765 // the previous decoding operation (stored in state) together 766 // with bytes from the array slice. 767 // 768 // We expect such "straddler characters" to be rare. 769 770 if (index >= limit) { // No bytes? No progress. 771 return state; 772 } 773 int byte1 = (byte) state; 774 // byte1 is never ASCII. 775 if (byte1 < (byte) 0xE0) { 776 // two-byte form 777 778 // Simultaneously checks for illegal trailing-byte in 779 // leading position and overlong 2-byte form. 780 if (byte1 < (byte) 0xC2 781 // byte2 trailing-byte test 782 || bytes[index++] > (byte) 0xBF) { 783 return MALFORMED; 784 } 785 } else if (byte1 < (byte) 0xF0) { 786 // three-byte form 787 788 // Get byte2 from saved state or array 789 int byte2 = (byte) ~(state >> 8); 790 if (byte2 == 0) { 791 byte2 = bytes[index++]; 792 if (index >= limit) { 793 return incompleteStateFor(byte1, byte2); 794 } 795 } 796 if (byte2 > (byte) 0xBF 797 // overlong? 5 most significant bits must not all be zero 798 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 799 // illegal surrogate codepoint? 800 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 801 // byte3 trailing-byte test 802 || bytes[index++] > (byte) 0xBF) { 803 return MALFORMED; 804 } 805 } else { 806 // four-byte form 807 808 // Get byte2 and byte3 from saved state or array 809 int byte2 = (byte) ~(state >> 8); 810 int byte3 = 0; 811 if (byte2 == 0) { 812 byte2 = bytes[index++]; 813 if (index >= limit) { 814 return incompleteStateFor(byte1, byte2); 815 } 816 } else { 817 byte3 = (byte) (state >> 16); 818 } 819 if (byte3 == 0) { 820 byte3 = bytes[index++]; 821 if (index >= limit) { 822 return incompleteStateFor(byte1, byte2, byte3); 823 } 824 } 825 826 // If we were called with state == MALFORMED, then byte1 is 0xFF, 827 // which never occurs in well-formed UTF-8, and so we will return 828 // MALFORMED again below. 829 830 if (byte2 > (byte) 0xBF 831 // Check that 1 <= plane <= 16. Tricky optimized form of: 832 // if (byte1 > (byte) 0xF4 || 833 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 834 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 835 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 836 // byte3 trailing-byte test 837 || byte3 > (byte) 0xBF 838 // byte4 trailing-byte test 839 || bytes[index++] > (byte) 0xBF) { 840 return MALFORMED; 841 } 842 } 843 } 844 845 return partialIsValidUtf8(bytes, index, limit); 846 } 847 848 @Override partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)849 int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) { 850 // For safe processing, we have to use the ByteBuffer API. 851 return partialIsValidUtf8Default(state, buffer, index, limit); 852 } 853 854 @Override encodeUtf8(CharSequence in, byte[] out, int offset, int length)855 int encodeUtf8(CharSequence in, byte[] out, int offset, int length) { 856 int utf16Length = in.length(); 857 int j = offset; 858 int i = 0; 859 int limit = offset + length; 860 // Designed to take advantage of 861 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 862 for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) { 863 out[j + i] = (byte) c; 864 } 865 if (i == utf16Length) { 866 return j + utf16Length; 867 } 868 j += i; 869 for (char c; i < utf16Length; i++) { 870 c = in.charAt(i); 871 if (c < 0x80 && j < limit) { 872 out[j++] = (byte) c; 873 } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes 874 out[j++] = (byte) ((0xF << 6) | (c >>> 6)); 875 out[j++] = (byte) (0x80 | (0x3F & c)); 876 } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) { 877 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 878 out[j++] = (byte) ((0xF << 5) | (c >>> 12)); 879 out[j++] = (byte) (0x80 | (0x3F & (c >>> 6))); 880 out[j++] = (byte) (0x80 | (0x3F & c)); 881 } else if (j <= limit - 4) { 882 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, 883 // four UTF-8 bytes 884 final char low; 885 if (i + 1 == in.length() 886 || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { 887 throw new UnpairedSurrogateException((i - 1), utf16Length); 888 } 889 int codePoint = Character.toCodePoint(c, low); 890 out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18)); 891 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 892 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 893 out[j++] = (byte) (0x80 | (0x3F & codePoint)); 894 } else { 895 // If we are surrogates and we're not a surrogate pair, always throw an 896 // UnpairedSurrogateException instead of an ArrayOutOfBoundsException. 897 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) 898 && (i + 1 == in.length() 899 || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { 900 throw new UnpairedSurrogateException(i, utf16Length); 901 } 902 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); 903 } 904 } 905 return j; 906 } 907 908 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)909 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 910 // For safe processing, we have to use the ByteBuffer API. 911 encodeUtf8Default(in, out); 912 } 913 partialIsValidUtf8(byte[] bytes, int index, int limit)914 private static int partialIsValidUtf8(byte[] bytes, int index, int limit) { 915 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 916 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 917 while (index < limit && bytes[index] >= 0) { 918 index++; 919 } 920 921 return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit); 922 } 923 partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)924 private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) { 925 for (;;) { 926 int byte1, byte2; 927 928 // Optimize for interior runs of ASCII bytes. 929 do { 930 if (index >= limit) { 931 return COMPLETE; 932 } 933 } while ((byte1 = bytes[index++]) >= 0); 934 935 if (byte1 < (byte) 0xE0) { 936 // two-byte form 937 938 if (index >= limit) { 939 // Incomplete sequence 940 return byte1; 941 } 942 943 // Simultaneously checks for illegal trailing-byte in 944 // leading position and overlong 2-byte form. 945 if (byte1 < (byte) 0xC2 946 || bytes[index++] > (byte) 0xBF) { 947 return MALFORMED; 948 } 949 } else if (byte1 < (byte) 0xF0) { 950 // three-byte form 951 952 if (index >= limit - 1) { // incomplete sequence 953 return incompleteStateFor(bytes, index, limit); 954 } 955 if ((byte2 = bytes[index++]) > (byte) 0xBF 956 // overlong? 5 most significant bits must not all be zero 957 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 958 // check for illegal surrogate codepoints 959 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 960 // byte3 trailing-byte test 961 || bytes[index++] > (byte) 0xBF) { 962 return MALFORMED; 963 } 964 } else { 965 // four-byte form 966 967 if (index >= limit - 2) { // incomplete sequence 968 return incompleteStateFor(bytes, index, limit); 969 } 970 if ((byte2 = bytes[index++]) > (byte) 0xBF 971 // Check that 1 <= plane <= 16. Tricky optimized form of: 972 // if (byte1 > (byte) 0xF4 || 973 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 974 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 975 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 976 // byte3 trailing-byte test 977 || bytes[index++] > (byte) 0xBF 978 // byte4 trailing-byte test 979 || bytes[index++] > (byte) 0xBF) { 980 return MALFORMED; 981 } 982 } 983 } 984 } 985 } 986 987 /** 988 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. 989 */ 990 static final class UnsafeProcessor extends Processor { 991 /** 992 * Indicates whether or not all required unsafe operations are supported on this platform. 993 */ isAvailable()994 static boolean isAvailable() { 995 return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); 996 } 997 998 @Override partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)999 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) { 1000 if ((index | limit | bytes.length - limit) < 0) { 1001 throw new ArrayIndexOutOfBoundsException( 1002 String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit)); 1003 } 1004 long offset = getArrayBaseOffset() + index; 1005 final long offsetLimit = getArrayBaseOffset() + limit; 1006 if (state != COMPLETE) { 1007 // The previous decoding operation was incomplete (or malformed). 1008 // We look for a well-formed sequence consisting of bytes from 1009 // the previous decoding operation (stored in state) together 1010 // with bytes from the array slice. 1011 // 1012 // We expect such "straddler characters" to be rare. 1013 1014 if (offset >= offsetLimit) { // No bytes? No progress. 1015 return state; 1016 } 1017 int byte1 = (byte) state; 1018 // byte1 is never ASCII. 1019 if (byte1 < (byte) 0xE0) { 1020 // two-byte form 1021 1022 // Simultaneously checks for illegal trailing-byte in 1023 // leading position and overlong 2-byte form. 1024 if (byte1 < (byte) 0xC2 1025 // byte2 trailing-byte test 1026 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1027 return MALFORMED; 1028 } 1029 } else if (byte1 < (byte) 0xF0) { 1030 // three-byte form 1031 1032 // Get byte2 from saved state or array 1033 int byte2 = (byte) ~(state >> 8); 1034 if (byte2 == 0) { 1035 byte2 = UnsafeUtil.getByte(bytes, offset++); 1036 if (offset >= offsetLimit) { 1037 return incompleteStateFor(byte1, byte2); 1038 } 1039 } 1040 if (byte2 > (byte) 0xBF 1041 // overlong? 5 most significant bits must not all be zero 1042 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1043 // illegal surrogate codepoint? 1044 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1045 // byte3 trailing-byte test 1046 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1047 return MALFORMED; 1048 } 1049 } else { 1050 // four-byte form 1051 1052 // Get byte2 and byte3 from saved state or array 1053 int byte2 = (byte) ~(state >> 8); 1054 int byte3 = 0; 1055 if (byte2 == 0) { 1056 byte2 = UnsafeUtil.getByte(bytes, offset++); 1057 if (offset >= offsetLimit) { 1058 return incompleteStateFor(byte1, byte2); 1059 } 1060 } else { 1061 byte3 = (byte) (state >> 16); 1062 } 1063 if (byte3 == 0) { 1064 byte3 = UnsafeUtil.getByte(bytes, offset++); 1065 if (offset >= offsetLimit) { 1066 return incompleteStateFor(byte1, byte2, byte3); 1067 } 1068 } 1069 1070 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1071 // which never occurs in well-formed UTF-8, and so we will return 1072 // MALFORMED again below. 1073 1074 if (byte2 > (byte) 0xBF 1075 // Check that 1 <= plane <= 16. Tricky optimized form of: 1076 // if (byte1 > (byte) 0xF4 || 1077 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1078 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1079 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1080 // byte3 trailing-byte test 1081 || byte3 > (byte) 0xBF 1082 // byte4 trailing-byte test 1083 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1084 return MALFORMED; 1085 } 1086 } 1087 } 1088 1089 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1090 } 1091 1092 @Override partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1093 int partialIsValidUtf8Direct( 1094 final int state, ByteBuffer buffer, final int index, final int limit) { 1095 if ((index | limit | buffer.limit() - limit) < 0) { 1096 throw new ArrayIndexOutOfBoundsException( 1097 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit)); 1098 } 1099 long address = addressOffset(buffer) + index; 1100 final long addressLimit = address + (limit - index); 1101 if (state != COMPLETE) { 1102 // The previous decoding operation was incomplete (or malformed). 1103 // We look for a well-formed sequence consisting of bytes from 1104 // the previous decoding operation (stored in state) together 1105 // with bytes from the array slice. 1106 // 1107 // We expect such "straddler characters" to be rare. 1108 1109 if (address >= addressLimit) { // No bytes? No progress. 1110 return state; 1111 } 1112 1113 final int byte1 = (byte) state; 1114 // byte1 is never ASCII. 1115 if (byte1 < (byte) 0xE0) { 1116 // two-byte form 1117 1118 // Simultaneously checks for illegal trailing-byte in 1119 // leading position and overlong 2-byte form. 1120 if (byte1 < (byte) 0xC2 1121 // byte2 trailing-byte test 1122 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1123 return MALFORMED; 1124 } 1125 } else if (byte1 < (byte) 0xF0) { 1126 // three-byte form 1127 1128 // Get byte2 from saved state or array 1129 int byte2 = (byte) ~(state >> 8); 1130 if (byte2 == 0) { 1131 byte2 = UnsafeUtil.getByte(address++); 1132 if (address >= addressLimit) { 1133 return incompleteStateFor(byte1, byte2); 1134 } 1135 } 1136 if (byte2 > (byte) 0xBF 1137 // overlong? 5 most significant bits must not all be zero 1138 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1139 // illegal surrogate codepoint? 1140 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1141 // byte3 trailing-byte test 1142 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1143 return MALFORMED; 1144 } 1145 } else { 1146 // four-byte form 1147 1148 // Get byte2 and byte3 from saved state or array 1149 int byte2 = (byte) ~(state >> 8); 1150 int byte3 = 0; 1151 if (byte2 == 0) { 1152 byte2 = UnsafeUtil.getByte(address++); 1153 if (address >= addressLimit) { 1154 return incompleteStateFor(byte1, byte2); 1155 } 1156 } else { 1157 byte3 = (byte) (state >> 16); 1158 } 1159 if (byte3 == 0) { 1160 byte3 = UnsafeUtil.getByte(address++); 1161 if (address >= addressLimit) { 1162 return incompleteStateFor(byte1, byte2, byte3); 1163 } 1164 } 1165 1166 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1167 // which never occurs in well-formed UTF-8, and so we will return 1168 // MALFORMED again below. 1169 1170 if (byte2 > (byte) 0xBF 1171 // Check that 1 <= plane <= 16. Tricky optimized form of: 1172 // if (byte1 > (byte) 0xF4 || 1173 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1174 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1175 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1176 // byte3 trailing-byte test 1177 || byte3 > (byte) 0xBF 1178 // byte4 trailing-byte test 1179 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1180 return MALFORMED; 1181 } 1182 } 1183 } 1184 1185 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1186 } 1187 1188 @Override encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1189 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) { 1190 long outIx = getArrayBaseOffset() + offset; 1191 final long outLimit = outIx + length; 1192 final int inLimit = in.length(); 1193 if (inLimit > length || out.length - length < offset) { 1194 // Not even enough room for an ASCII-encoded string. 1195 throw new ArrayIndexOutOfBoundsException( 1196 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1197 } 1198 1199 // Designed to take advantage of 1200 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1201 int inIx = 0; 1202 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1203 UnsafeUtil.putByte(out, outIx++, (byte) c); 1204 } 1205 if (inIx == inLimit) { 1206 // We're done, it was ASCII encoded. 1207 return (int) (outIx - getArrayBaseOffset()); 1208 } 1209 1210 for (char c; inIx < inLimit; ++inIx) { 1211 c = in.charAt(inIx); 1212 if (c < 0x80 && outIx < outLimit) { 1213 UnsafeUtil.putByte(out, outIx++, (byte) c); 1214 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1215 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1216 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1217 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1218 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1219 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1220 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1221 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1222 } else if (outIx <= outLimit - 4L) { 1223 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1224 // bytes 1225 final char low; 1226 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1227 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1228 } 1229 int codePoint = toCodePoint(c, low); 1230 UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1231 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1232 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1233 UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1234 } else { 1235 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1236 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1237 // We are surrogates and we're not a surrogate pair. 1238 throw new UnpairedSurrogateException(inIx, inLimit); 1239 } 1240 // Not enough space in the output buffer. 1241 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1242 } 1243 } 1244 1245 // All bytes have been encoded. 1246 return (int) (outIx - getArrayBaseOffset()); 1247 } 1248 1249 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)1250 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1251 final long address = addressOffset(out); 1252 long outIx = address + out.position(); 1253 final long outLimit = address + out.limit(); 1254 final int inLimit = in.length(); 1255 if (inLimit > outLimit - outIx) { 1256 // Not even enough room for an ASCII-encoded string. 1257 throw new ArrayIndexOutOfBoundsException( 1258 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit()); 1259 } 1260 1261 // Designed to take advantage of 1262 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1263 int inIx = 0; 1264 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1265 UnsafeUtil.putByte(outIx++, (byte) c); 1266 } 1267 if (inIx == inLimit) { 1268 // We're done, it was ASCII encoded. 1269 out.position((int) (outIx - address)); 1270 return; 1271 } 1272 1273 for (char c; inIx < inLimit; ++inIx) { 1274 c = in.charAt(inIx); 1275 if (c < 0x80 && outIx < outLimit) { 1276 UnsafeUtil.putByte(outIx++, (byte) c); 1277 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1278 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1279 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1280 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1281 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1282 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1283 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1284 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1285 } else if (outIx <= outLimit - 4L) { 1286 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1287 // bytes 1288 final char low; 1289 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1290 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1291 } 1292 int codePoint = toCodePoint(c, low); 1293 UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1294 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1295 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1296 UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1297 } else { 1298 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1299 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1300 // We are surrogates and we're not a surrogate pair. 1301 throw new UnpairedSurrogateException(inIx, inLimit); 1302 } 1303 // Not enough space in the output buffer. 1304 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1305 } 1306 } 1307 1308 // All bytes have been encoded. 1309 out.position((int) (outIx - address)); 1310 } 1311 1312 /** 1313 * Counts (approximately) the number of consecutive ASCII characters starting from the given 1314 * position, using the most efficient method available to the platform. 1315 * 1316 * @param bytes the array containing the character sequence 1317 * @param offset the offset position of the index (same as index + arrayBaseOffset) 1318 * @param maxChars the maximum number of characters to count 1319 * @return the number of ASCII characters found. The stopping position will be at or 1320 * before the first non-ASCII byte. 1321 */ unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1322 private static int unsafeEstimateConsecutiveAscii( 1323 byte[] bytes, long offset, final int maxChars) { 1324 int remaining = maxChars; 1325 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1326 // Don't bother with small strings. 1327 return 0; 1328 } 1329 1330 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1331 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that 1332 // the index (relative to the start of the array) is also 8-byte aligned. We do this by 1333 // ANDing the index with 7 to determine the number of bytes that need to be read before 1334 // we're 8-byte aligned. 1335 final int unaligned = (int) offset & 7; 1336 for (int j = unaligned; j > 0; j--) { 1337 if (UnsafeUtil.getByte(bytes, offset++) < 0) { 1338 return unaligned - j; 1339 } 1340 } 1341 1342 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1343 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1344 // determine if any byte in the current long is non-ASCII. 1345 remaining -= unaligned; 1346 for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_LONG) == 0; 1347 offset += 8, remaining -= 8) {} 1348 return maxChars - remaining; 1349 } 1350 1351 /** 1352 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1353 * most efficient method available to the platform. 1354 */ unsafeEstimateConsecutiveAscii(long address, final int maxChars)1355 private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) { 1356 int remaining = maxChars; 1357 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1358 // Don't bother with small strings. 1359 return 0; 1360 } 1361 1362 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1363 // We do this by ANDing the address with 7 to determine the number of bytes that need to 1364 // be read before we're 8-byte aligned. 1365 final int unaligned = (int) address & 7; 1366 for (int j = unaligned; j > 0; j--) { 1367 if (UnsafeUtil.getByte(address++) < 0) { 1368 return unaligned - j; 1369 } 1370 } 1371 1372 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1373 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1374 // determine if any byte in the current long is non-ASCII. 1375 remaining -= unaligned; 1376 for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0; 1377 address += 8, remaining -= 8) {} 1378 return maxChars - remaining; 1379 } 1380 partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1381 private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) { 1382 // Skip past ASCII characters as quickly as possible. 1383 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining); 1384 remaining -= skipped; 1385 offset += skipped; 1386 1387 for (;;) { 1388 // Optimize for interior runs of ASCII bytes. 1389 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1390 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1391 int byte1 = 0; 1392 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) { 1393 } 1394 if (remaining == 0) { 1395 return COMPLETE; 1396 } 1397 remaining--; 1398 1399 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1400 if (byte1 < (byte) 0xE0) { 1401 // Two-byte form (110xxxxx 10xxxxxx) 1402 if (remaining == 0) { 1403 // Incomplete sequence 1404 return byte1; 1405 } 1406 remaining--; 1407 1408 // Simultaneously checks for illegal trailing-byte in 1409 // leading position and overlong 2-byte form. 1410 if (byte1 < (byte) 0xC2 1411 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1412 return MALFORMED; 1413 } 1414 } else if (byte1 < (byte) 0xF0) { 1415 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1416 if (remaining < 2) { 1417 // Incomplete sequence 1418 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1419 } 1420 remaining -= 2; 1421 1422 final int byte2; 1423 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1424 // overlong? 5 most significant bits must not all be zero 1425 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1426 // check for illegal surrogate codepoints 1427 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1428 // byte3 trailing-byte test 1429 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1430 return MALFORMED; 1431 } 1432 } else { 1433 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 1434 if (remaining < 3) { 1435 // Incomplete sequence 1436 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1437 } 1438 remaining -= 3; 1439 1440 final int byte2; 1441 if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF 1442 // Check that 1 <= plane <= 16. Tricky optimized form of: 1443 // if (byte1 > (byte) 0xF4 || 1444 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1445 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1446 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1447 // byte3 trailing-byte test 1448 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF 1449 // byte4 trailing-byte test 1450 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { 1451 return MALFORMED; 1452 } 1453 } 1454 } 1455 } 1456 partialIsValidUtf8(long address, int remaining)1457 private static int partialIsValidUtf8(long address, int remaining) { 1458 // Skip past ASCII characters as quickly as possible. 1459 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1460 address += skipped; 1461 remaining -= skipped; 1462 1463 for (;;) { 1464 // Optimize for interior runs of ASCII bytes. 1465 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1466 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1467 int byte1 = 0; 1468 for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) { 1469 } 1470 if (remaining == 0) { 1471 return COMPLETE; 1472 } 1473 remaining--; 1474 1475 if (byte1 < (byte) 0xE0) { 1476 // Two-byte form 1477 1478 if (remaining == 0) { 1479 // Incomplete sequence 1480 return byte1; 1481 } 1482 remaining--; 1483 1484 // Simultaneously checks for illegal trailing-byte in 1485 // leading position and overlong 2-byte form. 1486 if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1487 return MALFORMED; 1488 } 1489 } else if (byte1 < (byte) 0xF0) { 1490 // Three-byte form 1491 1492 if (remaining < 2) { 1493 // Incomplete sequence 1494 return unsafeIncompleteStateFor(address, byte1, remaining); 1495 } 1496 remaining -= 2; 1497 1498 final byte byte2 = UnsafeUtil.getByte(address++); 1499 if (byte2 > (byte) 0xBF 1500 // overlong? 5 most significant bits must not all be zero 1501 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1502 // check for illegal surrogate codepoints 1503 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1504 // byte3 trailing-byte test 1505 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1506 return MALFORMED; 1507 } 1508 } else { 1509 // Four-byte form 1510 1511 if (remaining < 3) { 1512 // Incomplete sequence 1513 return unsafeIncompleteStateFor(address, byte1, remaining); 1514 } 1515 remaining -= 3; 1516 1517 final byte byte2 = UnsafeUtil.getByte(address++); 1518 if (byte2 > (byte) 0xBF 1519 // Check that 1 <= plane <= 16. Tricky optimized form of: 1520 // if (byte1 > (byte) 0xF4 || 1521 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1522 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1523 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1524 // byte3 trailing-byte test 1525 || UnsafeUtil.getByte(address++) > (byte) 0xBF 1526 // byte4 trailing-byte test 1527 || UnsafeUtil.getByte(address++) > (byte) 0xBF) { 1528 return MALFORMED; 1529 } 1530 } 1531 } 1532 } 1533 unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, int remaining)1534 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, 1535 int remaining) { 1536 switch (remaining) { 1537 case 0: { 1538 return incompleteStateFor(byte1); 1539 } 1540 case 1: { 1541 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); 1542 } 1543 case 2: { 1544 return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), 1545 UnsafeUtil.getByte(bytes, offset + 1)); 1546 } 1547 default: { 1548 throw new AssertionError(); 1549 } 1550 } 1551 } 1552 unsafeIncompleteStateFor(long address, final int byte1, int remaining)1553 private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) { 1554 switch (remaining) { 1555 case 0: { 1556 return incompleteStateFor(byte1); 1557 } 1558 case 1: { 1559 return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); 1560 } 1561 case 2: { 1562 return incompleteStateFor(byte1, UnsafeUtil.getByte(address), 1563 UnsafeUtil.getByte(address + 1)); 1564 } 1565 default: { 1566 throw new AssertionError(); 1567 } 1568 } 1569 } 1570 } 1571 Utf8()1572 private Utf8() {} 1573 } 1574