1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.flatbuffers; 32 33 import java.nio.ByteBuffer; 34 import static java.lang.Character.MAX_SURROGATE; 35 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 36 import static java.lang.Character.MIN_SURROGATE; 37 import static java.lang.Character.isSurrogatePair; 38 import static java.lang.Character.toCodePoint; 39 40 /** 41 * A set of low-level, high-performance static utility methods related 42 * to the UTF-8 character encoding. This class has no dependencies 43 * outside of the core JDK libraries. 44 * 45 * <p>There are several variants of UTF-8. The one implemented by 46 * this class is the restricted definition of UTF-8 introduced in 47 * Unicode 3.1, which mandates the rejection of "overlong" byte 48 * sequences as well as rejection of 3-byte surrogate codepoint byte 49 * sequences. Note that the UTF-8 decoder included in Oracle's JDK 50 * has been modified to also reject "overlong" byte sequences, but (as 51 * of 2011) still accepts 3-byte surrogate codepoint byte sequences. 52 * 53 * <p>The byte sequences considered valid by this class are exactly 54 * those that can be roundtrip converted to Strings and back to bytes 55 * using the UTF-8 charset, without loss: <pre> {@code 56 * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) 57 * }</pre> 58 * 59 * <p>See the Unicode Standard,</br> 60 * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> 61 * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>. 62 */ 63 final public class Utf8Safe extends Utf8 { 64 65 /** 66 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 67 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 68 * both time and space. 69 * 70 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 71 * surrogates) 72 */ computeEncodedLength(CharSequence sequence)73 private static int computeEncodedLength(CharSequence sequence) { 74 // Warning to maintainers: this implementation is highly optimized. 75 int utf16Length = sequence.length(); 76 int utf8Length = utf16Length; 77 int i = 0; 78 79 // This loop optimizes for pure ASCII. 80 while (i < utf16Length && sequence.charAt(i) < 0x80) { 81 i++; 82 } 83 84 // This loop optimizes for chars less than 0x800. 85 for (; i < utf16Length; i++) { 86 char c = sequence.charAt(i); 87 if (c < 0x800) { 88 utf8Length += ((0x7f - c) >>> 31); // branch free! 89 } else { 90 utf8Length += encodedLengthGeneral(sequence, i); 91 break; 92 } 93 } 94 95 if (utf8Length < utf16Length) { 96 // Necessary and sufficient condition for overflow because of maximum 3x expansion 97 throw new IllegalArgumentException("UTF-8 length does not fit in int: " 98 + (utf8Length + (1L << 32))); 99 } 100 return utf8Length; 101 } 102 encodedLengthGeneral(CharSequence sequence, int start)103 private static int encodedLengthGeneral(CharSequence sequence, int start) { 104 int utf16Length = sequence.length(); 105 int utf8Length = 0; 106 for (int i = start; i < utf16Length; i++) { 107 char c = sequence.charAt(i); 108 if (c < 0x800) { 109 utf8Length += (0x7f - c) >>> 31; // branch free! 110 } else { 111 utf8Length += 2; 112 // jdk7+: if (Character.isSurrogate(c)) { 113 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 114 // Check that we have a well-formed surrogate pair. 115 int cp = Character.codePointAt(sequence, i); 116 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) { 117 throw new Utf8Safe.UnpairedSurrogateException(i, utf16Length); 118 } 119 i++; 120 } 121 } 122 } 123 return utf8Length; 124 } 125 decodeUtf8Array(byte[] bytes, int index, int size)126 private static String decodeUtf8Array(byte[] bytes, int index, int size) { 127 // Bitwise OR combines the sign bits so any negative value fails the check. 128 if ((index | size | bytes.length - index - size) < 0) { 129 throw new ArrayIndexOutOfBoundsException( 130 String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); 131 } 132 133 int offset = index; 134 final int limit = offset + size; 135 136 // The longest possible resulting String is the same as the number of input bytes, when it is 137 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 138 char[] resultArr = new char[size]; 139 int resultPos = 0; 140 141 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 142 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 143 while (offset < limit) { 144 byte b = bytes[offset]; 145 if (!DecodeUtil.isOneByte(b)) { 146 break; 147 } 148 offset++; 149 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 150 } 151 152 while (offset < limit) { 153 byte byte1 = bytes[offset++]; 154 if (DecodeUtil.isOneByte(byte1)) { 155 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 156 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 157 // extra optimized loop to take care of these runs. 158 while (offset < limit) { 159 byte b = bytes[offset]; 160 if (!DecodeUtil.isOneByte(b)) { 161 break; 162 } 163 offset++; 164 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 165 } 166 } else if (DecodeUtil.isTwoBytes(byte1)) { 167 if (offset >= limit) { 168 throw new IllegalArgumentException("Invalid UTF-8"); 169 } 170 DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++); 171 } else if (DecodeUtil.isThreeBytes(byte1)) { 172 if (offset >= limit - 1) { 173 throw new IllegalArgumentException("Invalid UTF-8"); 174 } 175 DecodeUtil.handleThreeBytes( 176 byte1, 177 /* byte2 */ bytes[offset++], 178 /* byte3 */ bytes[offset++], 179 resultArr, 180 resultPos++); 181 } else { 182 if (offset >= limit - 2) { 183 throw new IllegalArgumentException("Invalid UTF-8"); 184 } 185 DecodeUtil.handleFourBytes( 186 byte1, 187 /* byte2 */ bytes[offset++], 188 /* byte3 */ bytes[offset++], 189 /* byte4 */ bytes[offset++], 190 resultArr, 191 resultPos++); 192 // 4-byte case requires two chars. 193 resultPos++; 194 } 195 } 196 197 return new String(resultArr, 0, resultPos); 198 } 199 decodeUtf8Buffer(ByteBuffer buffer, int offset, int length)200 private static String decodeUtf8Buffer(ByteBuffer buffer, int offset, 201 int length) { 202 // Bitwise OR combines the sign bits so any negative value fails the check. 203 if ((offset | length | buffer.limit() - offset - length) < 0) { 204 throw new ArrayIndexOutOfBoundsException( 205 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), 206 offset, length)); 207 } 208 209 final int limit = offset + length; 210 211 // The longest possible resulting String is the same as the number of input bytes, when it is 212 // all ASCII. For other cases, this over-allocates and we will truncate in the end. 213 char[] resultArr = new char[length]; 214 int resultPos = 0; 215 216 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 217 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 218 while (offset < limit) { 219 byte b = buffer.get(offset); 220 if (!DecodeUtil.isOneByte(b)) { 221 break; 222 } 223 offset++; 224 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 225 } 226 227 while (offset < limit) { 228 byte byte1 = buffer.get(offset++); 229 if (DecodeUtil.isOneByte(byte1)) { 230 DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); 231 // It's common for there to be multiple ASCII characters in a run mixed in, so add an 232 // extra optimized loop to take care of these runs. 233 while (offset < limit) { 234 byte b = buffer.get(offset); 235 if (!DecodeUtil.isOneByte(b)) { 236 break; 237 } 238 offset++; 239 DecodeUtil.handleOneByte(b, resultArr, resultPos++); 240 } 241 } else if (DecodeUtil.isTwoBytes(byte1)) { 242 if (offset >= limit) { 243 throw new IllegalArgumentException("Invalid UTF-8"); 244 } 245 DecodeUtil.handleTwoBytes( 246 byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++); 247 } else if (DecodeUtil.isThreeBytes(byte1)) { 248 if (offset >= limit - 1) { 249 throw new IllegalArgumentException("Invalid UTF-8"); 250 } 251 DecodeUtil.handleThreeBytes( 252 byte1, 253 /* byte2 */ buffer.get(offset++), 254 /* byte3 */ buffer.get(offset++), 255 resultArr, 256 resultPos++); 257 } else { 258 if (offset >= limit - 2) { 259 throw new IllegalArgumentException("Invalid UTF-8"); 260 } 261 DecodeUtil.handleFourBytes( 262 byte1, 263 /* byte2 */ buffer.get(offset++), 264 /* byte3 */ buffer.get(offset++), 265 /* byte4 */ buffer.get(offset++), 266 resultArr, 267 resultPos++); 268 // 4-byte case requires two chars. 269 resultPos++; 270 } 271 } 272 273 return new String(resultArr, 0, resultPos); 274 } 275 276 @Override encodedLength(CharSequence in)277 public int encodedLength(CharSequence in) { 278 return computeEncodedLength(in); 279 } 280 281 /** 282 * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}. 283 * 284 * @throws IllegalArgumentException if the input is not valid UTF-8. 285 */ 286 @Override decodeUtf8(ByteBuffer buffer, int offset, int length)287 public String decodeUtf8(ByteBuffer buffer, int offset, int length) 288 throws IllegalArgumentException { 289 if (buffer.hasArray()) { 290 return decodeUtf8Array(buffer.array(), buffer.arrayOffset() + offset, length); 291 } else { 292 return decodeUtf8Buffer(buffer, offset, length); 293 } 294 } 295 296 encodeUtf8Buffer(CharSequence in, ByteBuffer out)297 private static void encodeUtf8Buffer(CharSequence in, ByteBuffer out) { 298 final int inLength = in.length(); 299 int outIx = out.position(); 300 int inIx = 0; 301 302 // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check 303 // access. Assume the buffer is big enough and let it handle the out of bounds exception 304 // if it occurs. 305 try { 306 // Designed to take advantage of 307 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 308 for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { 309 out.put(outIx + inIx, (byte) c); 310 } 311 if (inIx == inLength) { 312 // Successfully encoded the entire string. 313 out.position(outIx + inIx); 314 return; 315 } 316 317 outIx += inIx; 318 for (char c; inIx < inLength; ++inIx, ++outIx) { 319 c = in.charAt(inIx); 320 if (c < 0x80) { 321 // One byte (0xxx xxxx) 322 out.put(outIx, (byte) c); 323 } else if (c < 0x800) { 324 // Two bytes (110x xxxx 10xx xxxx) 325 326 // Benchmarks show put performs better than putShort here (for HotSpot). 327 out.put(outIx++, (byte) (0xC0 | (c >>> 6))); 328 out.put(outIx, (byte) (0x80 | (0x3F & c))); 329 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 330 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 331 // Maximum single-char code point is 0xFFFF, 16 bits. 332 333 // Benchmarks show put performs better than putShort here (for HotSpot). 334 out.put(outIx++, (byte) (0xE0 | (c >>> 12))); 335 out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 336 out.put(outIx, (byte) (0x80 | (0x3F & c))); 337 } else { 338 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 339 340 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 341 // bytes 342 final char low; 343 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 344 throw new UnpairedSurrogateException(inIx, inLength); 345 } 346 // TODO(nathanmittler): Consider using putInt() to improve performance. 347 int codePoint = toCodePoint(c, low); 348 out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 349 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 350 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 351 out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); 352 } 353 } 354 355 // Successfully encoded the entire string. 356 out.position(outIx); 357 } catch (IndexOutOfBoundsException e) { 358 // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead. 359 360 // If we failed in the outer ASCII loop, outIx will not have been updated. In this case, 361 // use inIx to determine the bad write index. 362 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1); 363 throw new ArrayIndexOutOfBoundsException( 364 "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex); 365 } 366 } 367 encodeUtf8Array(CharSequence in, byte[] out, int offset, int length)368 private static int encodeUtf8Array(CharSequence in, byte[] out, 369 int offset, int length) { 370 int utf16Length = in.length(); 371 int j = offset; 372 int i = 0; 373 int limit = offset + length; 374 // Designed to take advantage of 375 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 376 for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) { 377 out[j + i] = (byte) c; 378 } 379 if (i == utf16Length) { 380 return j + utf16Length; 381 } 382 j += i; 383 for (char c; i < utf16Length; i++) { 384 c = in.charAt(i); 385 if (c < 0x80 && j < limit) { 386 out[j++] = (byte) c; 387 } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes 388 out[j++] = (byte) ((0xF << 6) | (c >>> 6)); 389 out[j++] = (byte) (0x80 | (0x3F & c)); 390 } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) { 391 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 392 out[j++] = (byte) ((0xF << 5) | (c >>> 12)); 393 out[j++] = (byte) (0x80 | (0x3F & (c >>> 6))); 394 out[j++] = (byte) (0x80 | (0x3F & c)); 395 } else if (j <= limit - 4) { 396 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, 397 // four UTF-8 bytes 398 final char low; 399 if (i + 1 == in.length() 400 || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { 401 throw new UnpairedSurrogateException((i - 1), utf16Length); 402 } 403 int codePoint = Character.toCodePoint(c, low); 404 out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18)); 405 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 406 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 407 out[j++] = (byte) (0x80 | (0x3F & codePoint)); 408 } else { 409 // If we are surrogates and we're not a surrogate pair, always throw an 410 // UnpairedSurrogateException instead of an ArrayOutOfBoundsException. 411 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) 412 && (i + 1 == in.length() 413 || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { 414 throw new UnpairedSurrogateException(i, utf16Length); 415 } 416 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); 417 } 418 } 419 return j; 420 } 421 422 /** 423 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 424 * 425 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 426 * and the capabilities of the platform. 427 * 428 * @param in the source string to be encoded 429 * @param out the target buffer to receive the encoded string. 430 */ 431 @Override encodeUtf8(CharSequence in, ByteBuffer out)432 public void encodeUtf8(CharSequence in, ByteBuffer out) { 433 if (out.hasArray()) { 434 int start = out.arrayOffset(); 435 int end = encodeUtf8Array(in, out.array(), start + out.position(), 436 out.remaining()); 437 out.position(end - start); 438 } else { 439 encodeUtf8Buffer(in, out); 440 } 441 } 442 443 // These UTF-8 handling methods are copied from Guava's Utf8Unsafe class with 444 // a modification to throw a local exception. This exception can be caught 445 // to fallback to more lenient behavior. 446 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)447 UnpairedSurrogateException(int index, int length) { 448 super("Unpaired surrogate at index " + index + " of " + length); 449 } 450 } 451 } 452