• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static java.lang.Character.MAX_SURROGATE;
34 import static java.lang.Character.MIN_SURROGATE;
35 import static java.lang.Character.isSurrogatePair;
36 import static java.lang.Character.toCodePoint;
37 
38 import java.lang.reflect.Field;
39 import java.nio.Buffer;
40 import java.nio.ByteBuffer;
41 import java.security.AccessController;
42 import java.security.PrivilegedExceptionAction;
43 import java.util.logging.Level;
44 import java.util.logging.Logger;
45 
46 /**
47  * A set of low-level, high-performance static utility methods related
48  * to the UTF-8 character encoding.  This class has no dependencies
49  * outside of the core JDK libraries.
50  *
51  * <p>There are several variants of UTF-8.  The one implemented by
52  * this class is the restricted definition of UTF-8 introduced in
53  * Unicode 3.1, which mandates the rejection of "overlong" byte
54  * sequences as well as rejection of 3-byte surrogate codepoint byte
55  * sequences.  Note that the UTF-8 decoder included in Oracle's JDK
56  * has been modified to also reject "overlong" byte sequences, but (as
57  * of 2011) still accepts 3-byte surrogate codepoint byte sequences.
58  *
59  * <p>The byte sequences considered valid by this class are exactly
60  * those that can be roundtrip converted to Strings and back to bytes
61  * using the UTF-8 charset, without loss: <pre> {@code
62  * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
63  * }</pre>
64  *
65  * <p>See the Unicode Standard,<br>
66  * Table 3-6. <em>UTF-8 Bit Distribution</em>,<br>
67  * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
68  *
69  * <p>This class supports decoding of partial byte sequences, so that the
70  * bytes in a complete UTF-8 byte sequence can be stored in multiple
71  * segments.  Methods typically return {@link #MALFORMED} if the partial
72  * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
73  * well-formed in the absence of additional input, or if the byte sequence
74  * apparently terminated in the middle of a character, an opaque integer
75  * "state" value containing enough information to decode the character when
76  * passed to a subsequent invocation of a partial decoding method.
77  *
78  * @author martinrb@google.com (Martin Buchholz)
79  */
80 // TODO(nathanmittler): Copy changes in this class back to Guava
81 final class Utf8 {
  /** Logger registered under this class's name. */
82   private static final Logger logger = Logger.getLogger(Utf8.class.getName());
83 
84   /**
85    * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
86    * depending on what is available on the platform. The processor is the platform-optimized
87    * delegate to which the static methods of this class forward their work: an
       * {@code UnsafeProcessor} when it reports itself available, otherwise a {@code SafeProcessor}.
88    */
89   private static final Processor processor =
90       UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor();
91 
92   /**
93    * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
94    * characters (i.e. any byte >= 0x80).
95    */
96   private static final long ASCII_MASK_LONG = 0x8080808080808080L;
97 
98   /**
99    * Maximum number of bytes per Java UTF-16 char in UTF-8.
100    * @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
101    */
102   static final int MAX_BYTES_PER_CHAR = 3;
103 
104   /**
105    * State value indicating that the byte sequence is well-formed and
106    * complete (no further bytes are needed to complete a character).
107    */
108   public static final int COMPLETE = 0;
109 
110   /**
111    * State value indicating that the byte sequence is definitely not
112    * well-formed.
113    */
114   public static final int MALFORMED = -1;
115 
116   /**
117    * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
118    * above which to employ an optimized algorithm for counting ASCII characters. The reason for this
119    * threshold is that for small strings, the optimization may not be beneficial or may even
120    * negatively impact performance since it requires additional logic to avoid unaligned reads
121    * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial
122    * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()}
123    * which provides a performance improvement that entirely subsumes the cost of the additional
124    * logic.
125    */
126   private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
127 
128   // Other state values include the partial bytes of the incomplete
129   // character to be decoded in the simplest way: we pack the bytes
130   // into the state int in little-endian order.  For example:
131   //
132   // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16);
133   //
134   // Such a state is unpacked thus (note the ~ operation for byte2 to
135   // undo byte1's sign-extension bits):
136   //
137   // int byte1 = (byte) state;
138   // int byte2 = (byte) ~(state >> 8);
139   // int byte3 = (byte) (state >> 16);
140   //
141   // We cannot store a zero byte in the state because it would be
142   // indistinguishable from the absence of a byte.  But we don't need
143   // to, because partial bytes must always be negative.  When building
144   // a state, we ensure that byte1 is negative and subsequent bytes
145   // are valid trailing bytes.
146 
147   /**
148    * Returns {@code true} if the given byte array is a well-formed
149    * UTF-8 byte sequence.
150    *
151    * <p>This is a convenience method, equivalent to a call to {@code
152    * isValidUtf8(bytes, 0, bytes.length)}.
153    */
isValidUtf8(byte[] bytes)154   public static boolean isValidUtf8(byte[] bytes) {
155     return processor.isValidUtf8(bytes, 0, bytes.length);
156   }
157 
158   /**
159    * Returns {@code true} if the given byte array slice is a
160    * well-formed UTF-8 byte sequence.  The range of bytes to be
161    * checked extends from index {@code index}, inclusive, to {@code
162    * limit}, exclusive.
163    *
164    * <p>This is a convenience method, equivalent to {@code
165    * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
166    */
isValidUtf8(byte[] bytes, int index, int limit)167   public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
168     return processor.isValidUtf8(bytes, index, limit);
169   }
170 
171   /**
172    * Tells whether the given byte array slice is a well-formed,
173    * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
174    * to be checked extends from index {@code index}, inclusive, to
175    * {@code limit}, exclusive.
176    *
177    * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
178    * operation) or the value returned from a call to a partial decoding method
179    * for the previous bytes
180    *
181    * @return {@link #MALFORMED} if the partial byte sequence is
182    * definitely not well-formed, {@link #COMPLETE} if it is well-formed
183    * (no additional input needed), or if the byte sequence is
184    * "incomplete", i.e. apparently terminated in the middle of a character,
185    * an opaque integer "state" value containing enough information to
186    * decode the character when passed to a subsequent invocation of a
187    * partial decoding method.
188    */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)189   public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
190     return processor.partialIsValidUtf8(state, bytes, index, limit);
191   }
192 
incompleteStateFor(int byte1)193   private static int incompleteStateFor(int byte1) {
194     return (byte1 > (byte) 0xF4) ?
195         MALFORMED : byte1;
196   }
197 
incompleteStateFor(int byte1, int byte2)198   private static int incompleteStateFor(int byte1, int byte2) {
199     return (byte1 > (byte) 0xF4 ||
200             byte2 > (byte) 0xBF) ?
201         MALFORMED : byte1 ^ (byte2 << 8);
202   }
203 
incompleteStateFor(int byte1, int byte2, int byte3)204   private static int incompleteStateFor(int byte1, int byte2, int byte3) {
205     return (byte1 > (byte) 0xF4 ||
206             byte2 > (byte) 0xBF ||
207             byte3 > (byte) 0xBF) ?
208         MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
209   }
210 
incompleteStateFor(byte[] bytes, int index, int limit)211   private static int incompleteStateFor(byte[] bytes, int index, int limit) {
212     int byte1 = bytes[index - 1];
213     switch (limit - index) {
214       case 0: return incompleteStateFor(byte1);
215       case 1: return incompleteStateFor(byte1, bytes[index]);
216       case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
217       default: throw new AssertionError();
218     }
219   }
220 
incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)221   private static int incompleteStateFor(
222       final ByteBuffer buffer, final int byte1, final int index, final int remaining) {
223     switch (remaining) {
224       case 0:
225         return incompleteStateFor(byte1);
226       case 1:
227         return incompleteStateFor(byte1, buffer.get(index));
228       case 2:
229         return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1));
230       default:
231         throw new AssertionError();
232     }
233   }
234 
235   // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
236   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
237   // fall back to more lenient behavior.
238 
239   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)240     private UnpairedSurrogateException(int index, int length) {
241       super("Unpaired surrogate at index " + index + " of " + length);
242     }
243   }
244 
245   /**
246    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
247    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
248    * both time and space.
249    *
250    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
251    *     surrogates)
252    */
encodedLength(CharSequence sequence)253   static int encodedLength(CharSequence sequence) {
254     // Warning to maintainers: this implementation is highly optimized.
255     int utf16Length = sequence.length();
256     int utf8Length = utf16Length;
257     int i = 0;
258 
259     // This loop optimizes for pure ASCII.
260     while (i < utf16Length && sequence.charAt(i) < 0x80) {
261       i++;
262     }
263 
264     // This loop optimizes for chars less than 0x800.
265     for (; i < utf16Length; i++) {
266       char c = sequence.charAt(i);
267       if (c < 0x800) {
268         utf8Length += ((0x7f - c) >>> 31);  // branch free!
269       } else {
270         utf8Length += encodedLengthGeneral(sequence, i);
271         break;
272       }
273     }
274 
275     if (utf8Length < utf16Length) {
276       // Necessary and sufficient condition for overflow because of maximum 3x expansion
277       throw new IllegalArgumentException("UTF-8 length does not fit in int: "
278               + (utf8Length + (1L << 32)));
279     }
280     return utf8Length;
281   }
282 
encodedLengthGeneral(CharSequence sequence, int start)283   private static int encodedLengthGeneral(CharSequence sequence, int start) {
284     int utf16Length = sequence.length();
285     int utf8Length = 0;
286     for (int i = start; i < utf16Length; i++) {
287       char c = sequence.charAt(i);
288       if (c < 0x800) {
289         utf8Length += (0x7f - c) >>> 31; // branch free!
290       } else {
291         utf8Length += 2;
292         // jdk7+: if (Character.isSurrogate(c)) {
293         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
294           // Check that we have a well-formed surrogate pair.
295           int cp = Character.codePointAt(sequence, i);
296           if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
297             throw new UnpairedSurrogateException(i, utf16Length);
298           }
299           i++;
300         }
301       }
302     }
303     return utf8Length;
304   }
305 
encode(CharSequence in, byte[] out, int offset, int length)306   static int encode(CharSequence in, byte[] out, int offset, int length) {
307     return processor.encodeUtf8(in, out, offset, length);
308   }
309   // End Guava UTF-8 methods.
310 
311   /**
312    * Determines if the given {@link ByteBuffer} is a valid UTF-8 string.
313    *
314    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
315    * and the capabilities of the platform.
316    *
317    * @param buffer the buffer to check.
318    * @see Utf8#isValidUtf8(byte[], int, int)
319    */
isValidUtf8(ByteBuffer buffer)320   static boolean isValidUtf8(ByteBuffer buffer) {
321     return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
322   }
323 
324   /**
325    * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
326    *
327    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
328    * and the capabilities of the platform.
329    *
330    * @param buffer the buffer to check.
331    * @see Utf8#partialIsValidUtf8(int, byte[], int, int)
332    */
partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)333   static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
334     return processor.partialIsValidUtf8(state, buffer, index, limit);
335   }
336 
337   /**
338    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
339    *
340    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
341    * and the capabilities of the platform.
342    *
343    * @param in the source string to be encoded
344    * @param out the target buffer to receive the encoded string.
345    * @see Utf8#encode(CharSequence, byte[], int, int)
346    */
encodeUtf8(CharSequence in, ByteBuffer out)347   static void encodeUtf8(CharSequence in, ByteBuffer out) {
348     processor.encodeUtf8(in, out);
349   }
350 
351   /**
352    * Counts (approximately) the number of consecutive ASCII characters in the given buffer.
353    * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if
354    * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
355    *
356    * @param buffer the buffer to be scanned for ASCII chars
357    * @param index the starting index of the scan
358    * @param limit the limit within buffer for the scan
359    * @return the number of ASCII characters found. The stopping position will be at or
360    * before the first non-ASCII byte.
361    */
estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)362   private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
363     int i = index;
364     final int lim = limit - 7;
365     // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
366     // To speed things up further, we're reading longs instead of bytes so we use a mask to
367     // determine if any byte in the current long is non-ASCII.
368     for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {}
369     return i - index;
370   }
371 
372   /**
373    * A processor of UTF-8 strings, providing methods for checking validity and encoding.
374    */
375   // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
376   abstract static class Processor {
377     /**
378      * Returns {@code true} if the given byte array slice is a
379      * well-formed UTF-8 byte sequence.  The range of bytes to be
380      * checked extends from index {@code index}, inclusive, to {@code
381      * limit}, exclusive.
382      *
383      * <p>This is a convenience method, equivalent to {@code
384      * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
385      */
isValidUtf8(byte[] bytes, int index, int limit)386     final boolean isValidUtf8(byte[] bytes, int index, int limit) {
387       return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
388     }
389 
390     /**
391      * Tells whether the given byte array slice is a well-formed,
392      * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
393      * to be checked extends from index {@code index}, inclusive, to
394      * {@code limit}, exclusive.
395      *
396      * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
397      * operation) or the value returned from a call to a partial decoding method
398      * for the previous bytes
         * @param bytes the array containing the slice to check
         * @param index position of the first byte to check (inclusive)
         * @param limit position at which checking stops (exclusive)
399      *
400      * @return {@link #MALFORMED} if the partial byte sequence is
401      * definitely not well-formed, {@link #COMPLETE} if it is well-formed
402      * (no additional input needed), or if the byte sequence is
403      * "incomplete", i.e. apparently terminated in the middle of a character,
404      * an opaque integer "state" value containing enough information to
405      * decode the character when passed to a subsequent invocation of a
406      * partial decoding method.
407      */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)408     abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
409 
410     /**
411      * Returns {@code true} if the given portion of the {@link ByteBuffer} is a
412      * well-formed UTF-8 byte sequence.  The range of bytes to be
413      * checked extends from index {@code index}, inclusive, to {@code
414      * limit}, exclusive.
415      *
416      * <p>This is a convenience method, equivalent to {@code
417      * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
418      */
isValidUtf8(ByteBuffer buffer, int index, int limit)419     final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
420       return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
421     }
422 
423     /**
424      * Indicates whether or not the given buffer contains a valid UTF-8 string.
425      *
426      * @param buffer the buffer to check.
427      * @return {@code true} if the given buffer contains a valid UTF-8 string.
428      */
partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)429     final int partialIsValidUtf8(
430         final int state, final ByteBuffer buffer, int index, final int limit) {
431       if (buffer.hasArray()) {
432         final int offset = buffer.arrayOffset();
433         return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
434       } else if (buffer.isDirect()){
435         return partialIsValidUtf8Direct(state, buffer, index, limit);
436       }
437       return partialIsValidUtf8Default(state, buffer, index, limit);
438     }
439 
440     /**
441      * Performs validation for direct {@link ByteBuffer} instances.
         *
         * @param state either {@link Utf8#COMPLETE} or the value returned by a previous partial
         *     validation call
         * @param buffer the direct buffer to check
         * @param index position of the first byte to check (inclusive)
         * @param limit position at which checking stops (exclusive)
         * @return the resulting state value
442      */
partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)443     abstract int partialIsValidUtf8Direct(
444         final int state, final ByteBuffer buffer, int index, final int limit);
445 
446     /**
447      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
448      * than potentially faster approaches. This first completes validation for the current
449      * character (provided by {@code state}) and then finishes validation for the sequence.
         *
         * @param state either {@link Utf8#COMPLETE} or the packed bytes of a character left
         *     unfinished by a previous call
         * @param buffer the buffer to check; bytes are read with absolute {@code get(int)} calls
         * @param index position of the first byte to check (inclusive)
         * @param limit position at which checking stops (exclusive)
         * @return {@code state} unchanged if it is not {@code COMPLETE} and there are no bytes to
         *     read, {@code MALFORMED} if the straddling character is ill-formed, a new incomplete
         *     state if the straddling character is still unfinished, and otherwise the result of
         *     validating the rest of the range
450      */
partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)451     final int partialIsValidUtf8Default(
452         final int state, final ByteBuffer buffer, int index, final int limit) {
453       if (state != COMPLETE) {
454         // The previous decoding operation was incomplete (or malformed).
455         // We look for a well-formed sequence consisting of bytes from
456         // the previous decoding operation (stored in state) together
457         // with bytes from the buffer.
458         //
459         // We expect such "straddler characters" to be rare.
460 
461         if (index >= limit) { // No bytes? No progress.
462           return state;
463         }
464 
465         byte byte1 = (byte) state;
466         // byte1 is never ASCII.
467         if (byte1 < (byte) 0xE0) {
468           // two-byte form
469 
470           // Simultaneously checks for illegal trailing-byte in
471           // leading position and overlong 2-byte form.
472           if (byte1 < (byte) 0xC2
473               // byte2 trailing-byte test
474               || buffer.get(index++) > (byte) 0xBF) {
475             return MALFORMED;
476           }
477         } else if (byte1 < (byte) 0xF0) {
478           // three-byte form
479 
480           // Get byte2 from saved state or buffer
481           byte byte2 = (byte) ~(state >> 8);
482           if (byte2 == 0) {
483             byte2 = buffer.get(index++);
484             if (index >= limit) {
485               return incompleteStateFor(byte1, byte2);
486             }
487           }
488           if (byte2 > (byte) 0xBF
489               // overlong? 5 most significant bits must not all be zero
490               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
491               // illegal surrogate codepoint?
492               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
493               // byte3 trailing-byte test
494               || buffer.get(index++) > (byte) 0xBF) {
495             return MALFORMED;
496           }
497         } else {
498           // four-byte form
499 
500           // Get byte2 and byte3 from saved state or buffer
501           byte byte2 = (byte) ~(state >> 8);
502           byte byte3 = 0;
503           if (byte2 == 0) {
504             byte2 = buffer.get(index++);
505             if (index >= limit) {
506               return incompleteStateFor(byte1, byte2);
507             }
508           } else {
509             byte3 = (byte) (state >> 16);
510           }
511           if (byte3 == 0) {
512             byte3 = buffer.get(index++);
513             if (index >= limit) {
514               return incompleteStateFor(byte1, byte2, byte3);
515             }
516           }
517 
518           // If we were called with state == MALFORMED, then byte1 is 0xFF,
519           // which never occurs in well-formed UTF-8, and so we will return
520           // MALFORMED again below.
521 
522           if (byte2 > (byte) 0xBF
523               // Check that 1 <= plane <= 16.  Tricky optimized form of:
524               // if (byte1 > (byte) 0xF4 ||
525               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
526               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
527               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
528               // byte3 trailing-byte test
529               || byte3 > (byte) 0xBF
530               // byte4 trailing-byte test
531               || buffer.get(index++) > (byte) 0xBF) {
532             return MALFORMED;
533           }
534         }
535       }
536 
537       // Finish validation for the sequence.
538       return partialIsValidUtf8(buffer, index, limit);
539     }
540 
541     /**
542      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
543      * than potentially faster approaches.
         *
         * <p>Starts at a character boundary (no carried-over state): skips a leading run of ASCII
         * in eight-byte steps, then validates one character at a time.
         *
         * @param buffer the buffer to check; bytes are read with absolute {@code get(int)} calls
         * @param index position of the first byte to check (inclusive)
         * @param limit position at which checking stops (exclusive)
         * @return {@code COMPLETE} if {@code limit} is reached at a character boundary,
         *     {@code MALFORMED} if an ill-formed sequence is found, or an incomplete state if the
         *     range ends in the middle of a character
544      */
partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)545     private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
546       index += estimateConsecutiveAscii(buffer, index, limit);
547 
548       for (;;) {
549         // Optimize for interior runs of ASCII bytes.
550         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
551         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
552         int byte1;
553         do {
554           if (index >= limit) {
555             return COMPLETE;
556           }
557         } while ((byte1 = buffer.get(index++)) >= 0);
558 
559         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
560         if (byte1 < (byte) 0xE0) {
561           // Two-byte form (110xxxxx 10xxxxxx)
562           if (index >= limit) {
563             // Incomplete sequence
564             return byte1;
565           }
566 
567           // Simultaneously checks for illegal trailing-byte in
568           // leading position and overlong 2-byte form.
569           if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) {
570             return MALFORMED;
571           }
572           index++;
573         } else if (byte1 < (byte) 0xF0) {
574           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
575           if (index >= limit - 1) {
576             // Incomplete sequence
577             return incompleteStateFor(buffer, byte1, index, limit - index);
578           }
579 
580           final byte byte2 = buffer.get(index++);
581           if (byte2 > (byte) 0xBF
582               // overlong? 5 most significant bits must not all be zero
583               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
584               // check for illegal surrogate codepoints
585               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
586               // byte3 trailing-byte test
587               || buffer.get(index) > (byte) 0xBF) {
588             return MALFORMED;
589           }
590           index++;
591         } else {
592           // Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
593           if (index >= limit - 2) {
594             // Incomplete sequence
595             return incompleteStateFor(buffer, byte1, index, limit - index);
596           }
597 
598           // TODO(nathanmittler): Consider using getInt() to improve performance.
599           final int byte2 = buffer.get(index++);
600           if (byte2 > (byte) 0xBF
601               // Check that 1 <= plane <= 16.  Tricky optimized form of:
602               // if (byte1 > (byte) 0xF4 ||
603               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
604               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
605               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
606               // byte3 trailing-byte test
607               || buffer.get(index++) > (byte) 0xBF
608               // byte4 trailing-byte test
609               || buffer.get(index++) > (byte) 0xBF) {
610             return MALFORMED;
611           }
612         }
613       }
614     }
615 
616     /**
617      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
618      * For a string, this method is similar to
619      * <pre>{@code
620      * byte[] a = string.getBytes(UTF_8);
621      * System.arraycopy(a, 0, out, offset, a.length);
622      * return offset + a.length;
623      * }</pre>
624      *
625      * but is more efficient in both time and space. One key difference is that this method
626      * requires paired surrogates, and therefore does not support chunking.
627      * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default
628      * replacement character, this method throws {@link UnpairedSurrogateException}.
629      *
630      * <p>To ensure sufficient space in the output buffer, either call {@link Utf8#encodedLength} to
631      * compute the exact amount needed, or leave room for
632      * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
633      * of bytes that any input can be encoded to.
634      *
635      * @param in the input character sequence to be encoded
636      * @param out the target array
637      * @param offset the starting offset in {@code out} to start writing at
638      * @param length the length of the {@code out} region, starting from {@code offset}
639      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
640      *     surrogates)
641      * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
642      *     {@code out.length - offset}
643      * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(in)}
644      */
encodeUtf8(CharSequence in, byte[] out, int offset, int length)645     abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length);
646 
647     /**
648      * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
649      * Upon returning from this method, the {@code out} position will point to the position after
650      * the last encoded byte. This method requires paired surrogates, and therefore does not
651      * support chunking.
652      *
653      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
654      * compute the exact amount needed, or leave room for
655      * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
656      * of bytes that any input can be encoded to.
657      *
658      * @param in the source character sequence to be encoded
659      * @param out the target buffer
660      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
661      *     surrogates)
662      * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
663      *     {@code out.remaining()}
664      */
encodeUtf8(CharSequence in, ByteBuffer out)665     final void encodeUtf8(CharSequence in, ByteBuffer out) {
666       if (out.hasArray()) {
667         final int offset = out.arrayOffset();
668         int endIndex =
669             Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
670         out.position(endIndex - offset);
671       } else if (out.isDirect()) {
672         encodeUtf8Direct(in, out);
673       } else {
674         encodeUtf8Default(in, out);
675       }
676     }
677 
    /**
     * Encodes the input character sequence to a direct {@link ByteBuffer} instance.
     *
     * <p>{@link #encodeUtf8(CharSequence, ByteBuffer)} dispatches here when {@code out} has no
     * accessible backing array and reports itself as direct.
     *
     * @param in the source character sequence to be encoded
     * @param out the direct target buffer
     */
    abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
682 
683     /**
684      * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link
685      * ByteBuffer} API, rather than potentially faster approaches.
686      */
encodeUtf8Default(CharSequence in, ByteBuffer out)687     final void encodeUtf8Default(CharSequence in, ByteBuffer out) {
688       final int inLength = in.length();
689       int outIx = out.position();
690       int inIx = 0;
691 
692       // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
693       // access. Assume the buffer is big enough and let it handle the out of bounds exception
694       // if it occurs.
695       try {
696         // Designed to take advantage of
697         // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
698         for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
699           out.put(outIx + inIx, (byte) c);
700         }
701         if (inIx == inLength) {
702           // Successfully encoded the entire string.
703           out.position(outIx + inIx);
704           return;
705         }
706 
707         outIx += inIx;
708         for (char c; inIx < inLength; ++inIx, ++outIx) {
709           c = in.charAt(inIx);
710           if (c < 0x80) {
711             // One byte (0xxx xxxx)
712             out.put(outIx, (byte) c);
713           } else if (c < 0x800) {
714             // Two bytes (110x xxxx 10xx xxxx)
715 
716             // Benchmarks show put performs better than putShort here (for HotSpot).
717             out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
718             out.put(outIx, (byte) (0x80 | (0x3F & c)));
719           } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
720             // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
721             // Maximum single-char code point is 0xFFFF, 16 bits.
722 
723             // Benchmarks show put performs better than putShort here (for HotSpot).
724             out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
725             out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
726             out.put(outIx, (byte) (0x80 | (0x3F & c)));
727           } else {
728             // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
729 
730             // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
731             // bytes
732             final char low;
733             if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
734               throw new UnpairedSurrogateException(inIx, inLength);
735             }
736             // TODO(nathanmittler): Consider using putInt() to improve performance.
737             int codePoint = toCodePoint(c, low);
738             out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
739             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
740             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
741             out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
742           }
743         }
744 
745         // Successfully encoded the entire string.
746         out.position(outIx);
747       } catch (IndexOutOfBoundsException e) {
748         // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead.
749 
750         // If we failed in the outer ASCII loop, outIx will not have been updated. In this case,
751         // use inIx to determine the bad write index.
752         int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
753         throw new ArrayIndexOutOfBoundsException(
754             "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex);
755       }
756     }
757   }
758 
759   /**
760    * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods.
761    */
762   static final class SafeProcessor extends Processor {
763     @Override
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)764     int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
765       if (state != COMPLETE) {
766         // The previous decoding operation was incomplete (or malformed).
767         // We look for a well-formed sequence consisting of bytes from
768         // the previous decoding operation (stored in state) together
769         // with bytes from the array slice.
770         //
771         // We expect such "straddler characters" to be rare.
772 
773         if (index >= limit) {  // No bytes? No progress.
774           return state;
775         }
776         int byte1 = (byte) state;
777         // byte1 is never ASCII.
778         if (byte1 < (byte) 0xE0) {
779           // two-byte form
780 
781           // Simultaneously checks for illegal trailing-byte in
782           // leading position and overlong 2-byte form.
783           if (byte1 < (byte) 0xC2
784               // byte2 trailing-byte test
785               || bytes[index++] > (byte) 0xBF) {
786             return MALFORMED;
787           }
788         } else if (byte1 < (byte) 0xF0) {
789           // three-byte form
790 
791           // Get byte2 from saved state or array
792           int byte2 = (byte) ~(state >> 8);
793           if (byte2 == 0) {
794             byte2 = bytes[index++];
795             if (index >= limit) {
796               return incompleteStateFor(byte1, byte2);
797             }
798           }
799           if (byte2 > (byte) 0xBF
800               // overlong? 5 most significant bits must not all be zero
801               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
802               // illegal surrogate codepoint?
803               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
804               // byte3 trailing-byte test
805               || bytes[index++] > (byte) 0xBF) {
806             return MALFORMED;
807           }
808         } else {
809           // four-byte form
810 
811           // Get byte2 and byte3 from saved state or array
812           int byte2 = (byte) ~(state >> 8);
813           int byte3 = 0;
814           if (byte2 == 0) {
815             byte2 = bytes[index++];
816             if (index >= limit) {
817               return incompleteStateFor(byte1, byte2);
818             }
819           } else {
820             byte3 = (byte) (state >> 16);
821           }
822           if (byte3 == 0) {
823             byte3 = bytes[index++];
824             if (index >= limit) {
825               return incompleteStateFor(byte1, byte2, byte3);
826             }
827           }
828 
829           // If we were called with state == MALFORMED, then byte1 is 0xFF,
830           // which never occurs in well-formed UTF-8, and so we will return
831           // MALFORMED again below.
832 
833           if (byte2 > (byte) 0xBF
834               // Check that 1 <= plane <= 16.  Tricky optimized form of:
835               // if (byte1 > (byte) 0xF4 ||
836               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
837               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
838               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
839               // byte3 trailing-byte test
840               || byte3 > (byte) 0xBF
841               // byte4 trailing-byte test
842               || bytes[index++] > (byte) 0xBF) {
843             return MALFORMED;
844           }
845         }
846       }
847 
848       return partialIsValidUtf8(bytes, index, limit);
849     }
850 
851     @Override
partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)852     int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
853       // For safe processing, we have to use the ByteBuffer API.
854       return partialIsValidUtf8Default(state, buffer, index, limit);
855     }
856 
857     @Override
encodeUtf8(CharSequence in, byte[] out, int offset, int length)858     int encodeUtf8(CharSequence in, byte[] out, int offset, int length) {
859       int utf16Length = in.length();
860       int j = offset;
861       int i = 0;
862       int limit = offset + length;
863       // Designed to take advantage of
864       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
865       for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) {
866         out[j + i] = (byte) c;
867       }
868       if (i == utf16Length) {
869         return j + utf16Length;
870       }
871       j += i;
872       for (char c; i < utf16Length; i++) {
873         c = in.charAt(i);
874         if (c < 0x80 && j < limit) {
875           out[j++] = (byte) c;
876         } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
877           out[j++] = (byte) ((0xF << 6) | (c >>> 6));
878           out[j++] = (byte) (0x80 | (0x3F & c));
879         } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) {
880           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
881           out[j++] = (byte) ((0xF << 5) | (c >>> 12));
882           out[j++] = (byte) (0x80 | (0x3F & (c >>> 6)));
883           out[j++] = (byte) (0x80 | (0x3F & c));
884         } else if (j <= limit - 4) {
885           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
886           // four UTF-8 bytes
887           final char low;
888           if (i + 1 == in.length()
889                   || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
890             throw new UnpairedSurrogateException((i - 1), utf16Length);
891           }
892           int codePoint = Character.toCodePoint(c, low);
893           out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18));
894           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
895           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
896           out[j++] = (byte) (0x80 | (0x3F & codePoint));
897         } else {
898           // If we are surrogates and we're not a surrogate pair, always throw an
899           // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
900           if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
901               && (i + 1 == in.length()
902                   || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
903             throw new UnpairedSurrogateException(i, utf16Length);
904           }
905           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
906         }
907       }
908       return j;
909     }
910 
911     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)912     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
913       // For safe processing, we have to use the ByteBuffer API.
914       encodeUtf8Default(in, out);
915     }
916 
partialIsValidUtf8(byte[] bytes, int index, int limit)917     private static int partialIsValidUtf8(byte[] bytes, int index, int limit) {
918       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
919       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
920       while (index < limit && bytes[index] >= 0) {
921         index++;
922       }
923 
924       return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit);
925     }
926 
partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)927     private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
928       for (;;) {
929         int byte1, byte2;
930 
931         // Optimize for interior runs of ASCII bytes.
932         do {
933           if (index >= limit) {
934             return COMPLETE;
935           }
936         } while ((byte1 = bytes[index++]) >= 0);
937 
938         if (byte1 < (byte) 0xE0) {
939           // two-byte form
940 
941           if (index >= limit) {
942             // Incomplete sequence
943             return byte1;
944           }
945 
946           // Simultaneously checks for illegal trailing-byte in
947           // leading position and overlong 2-byte form.
948           if (byte1 < (byte) 0xC2
949               || bytes[index++] > (byte) 0xBF) {
950             return MALFORMED;
951           }
952         } else if (byte1 < (byte) 0xF0) {
953           // three-byte form
954 
955           if (index >= limit - 1) { // incomplete sequence
956             return incompleteStateFor(bytes, index, limit);
957           }
958           if ((byte2 = bytes[index++]) > (byte) 0xBF
959               // overlong? 5 most significant bits must not all be zero
960               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
961               // check for illegal surrogate codepoints
962               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
963               // byte3 trailing-byte test
964               || bytes[index++] > (byte) 0xBF) {
965             return MALFORMED;
966           }
967         } else {
968           // four-byte form
969 
970           if (index >= limit - 2) {  // incomplete sequence
971             return incompleteStateFor(bytes, index, limit);
972           }
973           if ((byte2 = bytes[index++]) > (byte) 0xBF
974               // Check that 1 <= plane <= 16.  Tricky optimized form of:
975               // if (byte1 > (byte) 0xF4 ||
976               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
977               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
978               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
979               // byte3 trailing-byte test
980               || bytes[index++] > (byte) 0xBF
981               // byte4 trailing-byte test
982               || bytes[index++] > (byte) 0xBF) {
983             return MALFORMED;
984           }
985         }
986       }
987     }
988   }
989 
990   /**
991    * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
992    */
993   static final class UnsafeProcessor extends Processor {
    // Handle used for all raw memory reads and writes in this class; obtained once from
    // getUnsafe() during class initialization.
    private static final sun.misc.Unsafe UNSAFE = getUnsafe();
    // Result of fieldOffset(...) for the "address" field looked up on java.nio.Buffer.
    // NOTE(review): AVAILABLE below treats -1 as "offset could not be determined" — confirm
    // against fieldOffset().
    private static final long BUFFER_ADDRESS_OFFSET =
        fieldOffset(field(Buffer.class, "address"));
    // Result of byteArrayBaseOffset(); added to array indices to form the offsets that are
    // passed to UNSAFE together with a byte[].
    private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset();

    /**
     * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address
     * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()).
     */
    private static final boolean AVAILABLE =
        BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0;
1005 
    /**
     * Indicates whether or not all required unsafe operations are supported on this platform.
     *
     * @return the {@code AVAILABLE} flag computed when this class was initialized
     */
    static boolean isAvailable() {
      return AVAILABLE;
    }
1012 
    /**
     * Continues UTF-8 validation of {@code bytes[index..limit)} given the {@code state} left over
     * from validating the preceding chunk, reading the array through {@code UNSAFE}.
     *
     * <p>When {@code state} is not {@code COMPLETE} it carries the leading bytes of a character
     * that straddles the chunk boundary: the first byte in the low 8 bits, the bitwise complement
     * of the second byte in bits 8-15 and the third byte in bits 16-23. That character is
     * completed from the array first; the remaining bytes are then handed to the
     * {@code (byte[], long, int)} overload.
     *
     * @param state the state returned by the previous call, or {@code COMPLETE}
     * @param bytes the array holding the bytes to validate
     * @param index the index of the first byte to examine
     * @param limit the index just past the last byte to examine
     * @return {@code MALFORMED} if an ill-formed sequence is found, an incomplete-state value if
     *     the slice ends in the middle of a character (the unchanged {@code state} if the slice
     *     is empty), otherwise the result of validating the remaining bytes
     * @throws ArrayIndexOutOfBoundsException if {@code index} or {@code limit} is negative or
     *     {@code limit} exceeds {@code bytes.length}
     */
    @Override
    int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
      // The OR of the three values is negative exactly when at least one of them is negative,
      // i.e. index < 0, limit < 0 or limit > bytes.length.
      if ((index | limit | bytes.length - limit) < 0) {
        throw new ArrayIndexOutOfBoundsException(
            String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit));
      }
      // From here on positions are Unsafe offsets: array index plus ARRAY_BASE_OFFSET.
      long offset = ARRAY_BASE_OFFSET + index;
      final long offsetLimit = ARRAY_BASE_OFFSET + limit;
      if (state != COMPLETE) {
        // The previous decoding operation was incomplete (or malformed).
        // We look for a well-formed sequence consisting of bytes from
        // the previous decoding operation (stored in state) together
        // with bytes from the array slice.
        //
        // We expect such "straddler characters" to be rare.

        if (offset >= offsetLimit) {  // No bytes? No progress.
          return state;
        }
        int byte1 = (byte) state;
        // byte1 is never ASCII.
        // All comparisons below are on signed bytes, so 0x80..0xFF are negative values.
        if (byte1 < (byte) 0xE0) {
          // two-byte form

          // Simultaneously checks for illegal trailing-byte in
          // leading position and overlong 2-byte form.
          if (byte1 < (byte) 0xC2
              // byte2 trailing-byte test
              || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
            return MALFORMED;
          }
        } else if (byte1 < (byte) 0xF0) {
          // three-byte form

          // Get byte2 from saved state or array
          int byte2 = (byte) ~(state >> 8);
          if (byte2 == 0) {
            byte2 = UNSAFE.getByte(bytes, offset++);
            if (offset >= offsetLimit) {
              return incompleteStateFor(byte1, byte2);
            }
          }
          if (byte2 > (byte) 0xBF
              // overlong? 5 most significant bits must not all be zero
              || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
              // illegal surrogate codepoint?
              || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
              // byte3 trailing-byte test
              || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
            return MALFORMED;
          }
        } else {
          // four-byte form

          // Get byte2 and byte3 from saved state or array
          int byte2 = (byte) ~(state >> 8);
          int byte3 = 0;
          if (byte2 == 0) {
            byte2 = UNSAFE.getByte(bytes, offset++);
            if (offset >= offsetLimit) {
              return incompleteStateFor(byte1, byte2);
            }
          } else {
            byte3 = (byte) (state >> 16);
          }
          if (byte3 == 0) {
            byte3 = UNSAFE.getByte(bytes, offset++);
            if (offset >= offsetLimit) {
              return incompleteStateFor(byte1, byte2, byte3);
            }
          }

          // If we were called with state == MALFORMED, then byte1 is 0xFF,
          // which never occurs in well-formed UTF-8, and so we will return
          // MALFORMED again below.

          if (byte2 > (byte) 0xBF
              // Check that 1 <= plane <= 16.  Tricky optimized form of:
              // if (byte1 > (byte) 0xF4 ||
              //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
              //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
              || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
              // byte3 trailing-byte test
              || byte3 > (byte) 0xBF
              // byte4 trailing-byte test
              || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
            return MALFORMED;
          }
        }
      }

      // Validate whatever is left, passing the current offset and the remaining byte count.
      return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
    }
1106 
    /**
     * Continues UTF-8 validation of the bytes of {@code buffer} in {@code [index, limit)} given
     * the {@code state} left over from validating the preceding chunk, reading memory through
     * {@code UNSAFE} at addresses derived from {@code addressOffset(buffer)}.
     *
     * <p>When {@code state} is not {@code COMPLETE} it carries the leading bytes of a character
     * that straddles the chunk boundary: the first byte in the low 8 bits, the bitwise complement
     * of the second byte in bits 8-15 and the third byte in bits 16-23. That character is
     * completed from the buffer first; the remaining bytes are then handed to the
     * {@code (long, int)} overload.
     *
     * @param state the state returned by the previous call, or {@code COMPLETE}
     * @param buffer the buffer holding the bytes to validate
     * @param index the buffer index of the first byte to examine
     * @param limit the buffer index just past the last byte to examine
     * @return {@code MALFORMED} if an ill-formed sequence is found, an incomplete-state value if
     *     the range ends in the middle of a character (the unchanged {@code state} if the range
     *     is empty), otherwise the result of validating the remaining bytes
     * @throws ArrayIndexOutOfBoundsException if {@code index} or {@code limit} is negative or
     *     {@code limit} exceeds {@code buffer.limit()}
     */
    @Override
    int partialIsValidUtf8Direct(
        final int state, ByteBuffer buffer, final int index, final int limit) {
      // The OR of the three values is negative exactly when at least one of them is negative,
      // i.e. index < 0, limit < 0 or limit > buffer.limit().
      if ((index | limit | buffer.limit() - limit) < 0) {
        throw new ArrayIndexOutOfBoundsException(
            String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit));
      }
      // From here on positions are addresses: addressOffset(buffer) plus the buffer index.
      long address = addressOffset(buffer) + index;
      final long addressLimit = address + (limit - index);
      if (state != COMPLETE) {
        // The previous decoding operation was incomplete (or malformed).
        // We look for a well-formed sequence consisting of bytes from
        // the previous decoding operation (stored in state) together
        // with bytes from the array slice.
        //
        // We expect such "straddler characters" to be rare.

        if (address >= addressLimit) { // No bytes? No progress.
          return state;
        }

        final int byte1 = (byte) state;
        // byte1 is never ASCII.
        // All comparisons below are on signed bytes, so 0x80..0xFF are negative values.
        if (byte1 < (byte) 0xE0) {
          // two-byte form

          // Simultaneously checks for illegal trailing-byte in
          // leading position and overlong 2-byte form.
          if (byte1 < (byte) 0xC2
              // byte2 trailing-byte test
              || UNSAFE.getByte(address++) > (byte) 0xBF) {
            return MALFORMED;
          }
        } else if (byte1 < (byte) 0xF0) {
          // three-byte form

          // Get byte2 from saved state or array
          int byte2 = (byte) ~(state >> 8);
          if (byte2 == 0) {
            byte2 = UNSAFE.getByte(address++);
            if (address >= addressLimit) {
              return incompleteStateFor(byte1, byte2);
            }
          }
          if (byte2 > (byte) 0xBF
              // overlong? 5 most significant bits must not all be zero
              || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
              // illegal surrogate codepoint?
              || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
              // byte3 trailing-byte test
              || UNSAFE.getByte(address++) > (byte) 0xBF) {
            return MALFORMED;
          }
        } else {
          // four-byte form

          // Get byte2 and byte3 from saved state or array
          int byte2 = (byte) ~(state >> 8);
          int byte3 = 0;
          if (byte2 == 0) {
            byte2 = UNSAFE.getByte(address++);
            if (address >= addressLimit) {
              return incompleteStateFor(byte1, byte2);
            }
          } else {
            byte3 = (byte) (state >> 16);
          }
          if (byte3 == 0) {
            byte3 = UNSAFE.getByte(address++);
            if (address >= addressLimit) {
              return incompleteStateFor(byte1, byte2, byte3);
            }
          }

          // If we were called with state == MALFORMED, then byte1 is 0xFF,
          // which never occurs in well-formed UTF-8, and so we will return
          // MALFORMED again below.

          if (byte2 > (byte) 0xBF
              // Check that 1 <= plane <= 16.  Tricky optimized form of:
              // if (byte1 > (byte) 0xF4 ||
              //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
              //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
              || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
              // byte3 trailing-byte test
              || byte3 > (byte) 0xBF
              // byte4 trailing-byte test
              || UNSAFE.getByte(address++) > (byte) 0xBF) {
            return MALFORMED;
          }
        }
      }

      // Validate whatever is left, passing the current address and the remaining byte count.
      return partialIsValidUtf8(address, (int) (addressLimit - address));
    }
1202 
1203     @Override
encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1204     int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) {
1205       long outIx = ARRAY_BASE_OFFSET + offset;
1206       final long outLimit = outIx + length;
1207       final int inLimit = in.length();
1208       if (inLimit > length || out.length - length < offset) {
1209         // Not even enough room for an ASCII-encoded string.
1210         throw new ArrayIndexOutOfBoundsException(
1211             "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1212       }
1213 
1214       // Designed to take advantage of
1215       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1216       int inIx = 0;
1217       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1218         UNSAFE.putByte(out, outIx++, (byte) c);
1219       }
1220       if (inIx == inLimit) {
1221         // We're done, it was ASCII encoded.
1222         return (int) (outIx - ARRAY_BASE_OFFSET);
1223       }
1224 
1225       for (char c; inIx < inLimit; ++inIx) {
1226         c = in.charAt(inIx);
1227         if (c < 0x80 && outIx < outLimit) {
1228           UNSAFE.putByte(out, outIx++, (byte) c);
1229         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1230           UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1231           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1232         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1233           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1234           UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1235           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1236           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1237         } else if (outIx <= outLimit - 4L) {
1238           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1239           // bytes
1240           final char low;
1241           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1242             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1243           }
1244           int codePoint = toCodePoint(c, low);
1245           UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1246           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1247           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1248           UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1249         } else {
1250           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1251               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1252             // We are surrogates and we're not a surrogate pair.
1253             throw new UnpairedSurrogateException(inIx, inLimit);
1254           }
1255           // Not enough space in the output buffer.
1256           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1257         }
1258       }
1259 
1260       // All bytes have been encoded.
1261       return (int) (outIx - ARRAY_BASE_OFFSET);
1262     }
1263 
1264     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)1265     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1266       final long address = addressOffset(out);
1267       long outIx = address + out.position();
1268       final long outLimit = address + out.limit();
1269       final int inLimit = in.length();
1270       if (inLimit > outLimit - outIx) {
1271         // Not even enough room for an ASCII-encoded string.
1272         throw new ArrayIndexOutOfBoundsException(
1273             "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit());
1274       }
1275 
1276       // Designed to take advantage of
1277       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1278       int inIx = 0;
1279       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1280         UNSAFE.putByte(outIx++, (byte) c);
1281       }
1282       if (inIx == inLimit) {
1283         // We're done, it was ASCII encoded.
1284         out.position((int) (outIx - address));
1285         return;
1286       }
1287 
1288       for (char c; inIx < inLimit; ++inIx) {
1289         c = in.charAt(inIx);
1290         if (c < 0x80 && outIx < outLimit) {
1291           UNSAFE.putByte(outIx++, (byte) c);
1292         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1293           UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1294           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1295         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1296           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1297           UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1298           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1299           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1300         } else if (outIx <= outLimit - 4L) {
1301           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1302           // bytes
1303           final char low;
1304           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1305             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1306           }
1307           int codePoint = toCodePoint(c, low);
1308           UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1309           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1310           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1311           UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1312         } else {
1313           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1314               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1315             // We are surrogates and we're not a surrogate pair.
1316             throw new UnpairedSurrogateException(inIx, inLimit);
1317           }
1318           // Not enough space in the output buffer.
1319           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1320         }
1321       }
1322 
1323       // All bytes have been encoded.
1324       out.position((int) (outIx - address));
1325     }
1326 
1327     /**
1328      * Counts (approximately) the number of consecutive ASCII characters starting from the given
1329      * position, using the most efficient method available to the platform.
1330      *
1331      * @param bytes the array containing the character sequence
1332      * @param offset the offset position of the index (same as index + arrayBaseOffset)
1333      * @param maxChars the maximum number of characters to count
1334      * @return the number of ASCII characters found. The stopping position will be at or
1335      * before the first non-ASCII byte.
1336      */
unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1337     private static int unsafeEstimateConsecutiveAscii(
1338         byte[] bytes, long offset, final int maxChars) {
1339       int remaining = maxChars;
1340       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1341         // Don't bother with small strings.
1342         return 0;
1343       }
1344 
1345       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1346       // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that
1347       // the index (relative to the start of the array) is also 8-byte aligned. We do this by
1348       // ANDing the index with 7 to determine the number of bytes that need to be read before
1349       // we're 8-byte aligned.
1350       final int unaligned = (int) offset & 7;
1351       for (int j = unaligned; j > 0; j--) {
1352         if (UNSAFE.getByte(bytes, offset++) < 0) {
1353           return unaligned - j;
1354         }
1355       }
1356 
1357       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1358       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1359       // determine if any byte in the current long is non-ASCII.
1360       remaining -= unaligned;
1361       for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0;
1362           offset += 8, remaining -= 8) {}
1363       return maxChars - remaining;
1364     }
1365 
1366     /**
1367      * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1368      * most efficient method available to the platform.
1369      */
unsafeEstimateConsecutiveAscii(long address, final int maxChars)1370     private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) {
1371       int remaining = maxChars;
1372       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1373         // Don't bother with small strings.
1374         return 0;
1375       }
1376 
1377       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1378       // We do this by ANDing the address with 7 to determine the number of bytes that need to
1379       // be read before we're 8-byte aligned.
1380       final int unaligned = (int) address & 7;
1381       for (int j = unaligned; j > 0; j--) {
1382         if (UNSAFE.getByte(address++) < 0) {
1383           return unaligned - j;
1384         }
1385       }
1386 
1387       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1388       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1389       // determine if any byte in the current long is non-ASCII.
1390       remaining -= unaligned;
1391       for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0;
1392           address += 8, remaining -= 8) {}
1393       return maxChars - remaining;
1394     }
1395 
partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1396     private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
1397       // Skip past ASCII characters as quickly as possible.
1398       final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
1399       remaining -= skipped;
1400       offset += skipped;
1401 
1402       for (;;) {
1403         // Optimize for interior runs of ASCII bytes.
1404         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1405         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1406         int byte1 = 0;
1407         for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) {
1408         }
1409         if (remaining == 0) {
1410           return COMPLETE;
1411         }
1412         remaining--;
1413 
1414         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1415         if (byte1 < (byte) 0xE0) {
1416           // Two-byte form (110xxxxx 10xxxxxx)
1417           if (remaining == 0) {
1418             // Incomplete sequence
1419             return byte1;
1420           }
1421           remaining--;
1422 
1423           // Simultaneously checks for illegal trailing-byte in
1424           // leading position and overlong 2-byte form.
1425           if (byte1 < (byte) 0xC2
1426               || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1427             return MALFORMED;
1428           }
1429         } else if (byte1 < (byte) 0xF0) {
1430           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1431           if (remaining < 2) {
1432             // Incomplete sequence
1433             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1434           }
1435           remaining -= 2;
1436 
1437           final int byte2;
1438           if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
1439               // overlong? 5 most significant bits must not all be zero
1440               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1441               // check for illegal surrogate codepoints
1442               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1443               // byte3 trailing-byte test
1444               || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1445             return MALFORMED;
1446           }
1447         } else {
1448           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
1449           if (remaining < 3) {
1450             // Incomplete sequence
1451             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1452           }
1453           remaining -= 3;
1454 
1455           final int byte2;
1456           if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF
1457               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1458               // if (byte1 > (byte) 0xF4 ||
1459               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1460               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1461               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1462               // byte3 trailing-byte test
1463               || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF
1464               // byte4 trailing-byte test
1465               || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) {
1466             return MALFORMED;
1467           }
1468         }
1469       }
1470     }
1471 
partialIsValidUtf8(long address, int remaining)1472     private static int partialIsValidUtf8(long address, int remaining) {
1473       // Skip past ASCII characters as quickly as possible.
1474       final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1475       address += skipped;
1476       remaining -= skipped;
1477 
1478       for (;;) {
1479         // Optimize for interior runs of ASCII bytes.
1480         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1481         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1482         int byte1 = 0;
1483         for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --remaining) {
1484         }
1485         if (remaining == 0) {
1486           return COMPLETE;
1487         }
1488         remaining--;
1489 
1490         if (byte1 < (byte) 0xE0) {
1491           // Two-byte form
1492 
1493           if (remaining == 0) {
1494             // Incomplete sequence
1495             return byte1;
1496           }
1497           remaining--;
1498 
1499           // Simultaneously checks for illegal trailing-byte in
1500           // leading position and overlong 2-byte form.
1501           if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) {
1502             return MALFORMED;
1503           }
1504         } else if (byte1 < (byte) 0xF0) {
1505           // Three-byte form
1506 
1507           if (remaining < 2) {
1508             // Incomplete sequence
1509             return unsafeIncompleteStateFor(address, byte1, remaining);
1510           }
1511           remaining -= 2;
1512 
1513           final byte byte2 = UNSAFE.getByte(address++);
1514           if (byte2 > (byte) 0xBF
1515               // overlong? 5 most significant bits must not all be zero
1516               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1517               // check for illegal surrogate codepoints
1518               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1519               // byte3 trailing-byte test
1520               || UNSAFE.getByte(address++) > (byte) 0xBF) {
1521             return MALFORMED;
1522           }
1523         } else {
1524           // Four-byte form
1525 
1526           if (remaining < 3) {
1527             // Incomplete sequence
1528             return unsafeIncompleteStateFor(address, byte1, remaining);
1529           }
1530           remaining -= 3;
1531 
1532           final byte byte2 = UNSAFE.getByte(address++);
1533           if (byte2 > (byte) 0xBF
1534               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1535               // if (byte1 > (byte) 0xF4 ||
1536               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1537               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1538               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1539               // byte3 trailing-byte test
1540               || UNSAFE.getByte(address++) > (byte) 0xBF
1541               // byte4 trailing-byte test
1542               || UNSAFE.getByte(address++) > (byte) 0xBF) {
1543             return MALFORMED;
1544           }
1545         }
1546       }
1547     }
1548 
unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, int remaining)1549     private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset,
1550         int remaining) {
1551       switch (remaining) {
1552         case 0: {
1553           return incompleteStateFor(byte1);
1554         }
1555         case 1: {
1556           return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset));
1557         }
1558         case 2: {
1559           return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset),
1560               UNSAFE.getByte(bytes, offset + 1));
1561         }
1562         default: {
1563           throw new AssertionError();
1564         }
1565       }
1566     }
1567 
unsafeIncompleteStateFor(long address, final int byte1, int remaining)1568     private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
1569       switch (remaining) {
1570         case 0: {
1571           return incompleteStateFor(byte1);
1572         }
1573         case 1: {
1574           return incompleteStateFor(byte1, UNSAFE.getByte(address));
1575         }
1576         case 2: {
1577           return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getByte(address + 1));
1578         }
1579         default: {
1580           throw new AssertionError();
1581         }
1582       }
1583     }
1584 
1585     /**
1586      * Gets the field with the given name within the class, or {@code null} if not found. If
1587      * found, the field is made accessible.
1588      */
field(Class<?> clazz, String fieldName)1589     private static Field field(Class<?> clazz, String fieldName) {
1590       Field field;
1591       try {
1592         field = clazz.getDeclaredField(fieldName);
1593         field.setAccessible(true);
1594       } catch (Throwable t) {
1595         // Failed to access the fields.
1596         field = null;
1597       }
1598       logger.log(Level.FINEST, "{0}.{1}: {2}",
1599           new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")});
1600       return field;
1601     }
1602 
1603     /**
1604      * Returns the offset of the provided field, or {@code -1} if {@code sun.misc.Unsafe} is not
1605      * available.
1606      */
fieldOffset(Field field)1607     private static long fieldOffset(Field field) {
1608       return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(field);
1609     }
1610 
1611     /**
1612      * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Unsafe} is not
1613      * available.
1614      */
byteArrayBaseOffset()1615     private static <T> int byteArrayBaseOffset() {
1616       return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class);
1617     }
1618 
1619     /**
1620      * Gets the offset of the {@code address} field of the given direct {@link ByteBuffer}.
1621      */
addressOffset(ByteBuffer buffer)1622     private static long addressOffset(ByteBuffer buffer) {
1623       return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET);
1624     }
1625 
1626     /**
1627      * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not available on this
1628      * platform.
1629      */
getUnsafe()1630     private static sun.misc.Unsafe getUnsafe() {
1631       sun.misc.Unsafe unsafe = null;
1632       try {
1633         unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun.misc.Unsafe>() {
1634           @Override
1635           public sun.misc.Unsafe run() throws Exception {
1636             Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class;
1637 
1638             // Check that this platform supports all of the required unsafe methods.
1639             checkRequiredMethods(k);
1640 
1641             for (Field f : k.getDeclaredFields()) {
1642               f.setAccessible(true);
1643               Object x = f.get(null);
1644               if (k.isInstance(x)) {
1645                 return k.cast(x);
1646               }
1647             }
1648             // The sun.misc.Unsafe field does not exist.
1649             return null;
1650           }
1651         });
1652       } catch (Throwable e) {
1653         // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError
1654         // for Unsafe.
1655       }
1656 
1657       logger.log(Level.FINEST, "sun.misc.Unsafe: {}",
1658           unsafe != null ? "available" : "unavailable");
1659       return unsafe;
1660     }
1661 
1662     /**
1663      * Verifies that all required methods of {@code sun.misc.Unsafe} are available on this platform.
1664      */
checkRequiredMethods(Class<sun.misc.Unsafe> clazz)1665     private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz)
1666         throws NoSuchMethodException, SecurityException {
1667       // Needed for Unsafe byte[] access
1668       clazz.getMethod("arrayBaseOffset", Class.class);
1669       clazz.getMethod("getByte", Object.class, long.class);
1670       clazz.getMethod("putByte", Object.class, long.class, byte.class);
1671       clazz.getMethod("getLong", Object.class, long.class);
1672 
1673       // Needed for Unsafe Direct ByteBuffer access
1674       clazz.getMethod("objectFieldOffset", Field.class);
1675       clazz.getMethod("getByte", long.class);
1676       clazz.getMethod("getLong", Object.class, long.class);
1677       clazz.getMethod("putByte", long.class, byte.class);
1678       clazz.getMethod("getLong", long.class);
1679     }
1680   }
1681 
Utf8()1682   private Utf8() {}
1683 }
1684