• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static com.google.protobuf.UnsafeUtil.addressOffset;
34 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
35 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
36 import static java.lang.Character.MAX_SURROGATE;
37 import static java.lang.Character.MIN_HIGH_SURROGATE;
38 import static java.lang.Character.MIN_LOW_SURROGATE;
39 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
40 import static java.lang.Character.MIN_SURROGATE;
41 import static java.lang.Character.isSurrogatePair;
42 import static java.lang.Character.toCodePoint;
43 
44 import java.nio.ByteBuffer;
45 
46 /**
47  * A set of low-level, high-performance static utility methods related to the UTF-8 character
48  * encoding. This class has no dependencies outside of the core JDK libraries.
49  *
50  * <p>There are several variants of UTF-8. The one implemented by this class is the restricted
51  * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte
52  * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8
53  * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but
54  * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences.
55  *
56  * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip
57  * converted to Strings and back to bytes using the UTF-8 charset, without loss:
58  *
59  * <pre>{@code
60  * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
61  * }</pre>
62  *
63  * <p>See the Unicode Standard,</br> Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> Table 3-7.
64  * <em>Well Formed UTF-8 Byte Sequences</em>.
65  *
66  * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8
67  * byte sequences can be stored in multiple segments. Methods typically return {@link #MALFORMED} if
68  * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed
69  * in the absence of additional input, or if the byte sequence apparently terminated in the middle
70  * of a character, an opaque integer "state" value containing enough information to decode the
71  * character when passed to a subsequent invocation of a partial decoding method.
72  *
73  * @author martinrb@google.com (Martin Buchholz)
74  */
75 // TODO(nathanmittler): Copy changes in this class back to Guava
76 final class Utf8 {
77 
78   /**
79    * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
80    * depending on what is available on the platform. The processor is the platform-optimized
81    * delegate for which all methods are delegated directly to.
82    */
83   private static final Processor processor =
84       (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice())
85           ? new UnsafeProcessor()
86           : new SafeProcessor();
87 
88   /**
89    * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
90    * characters (i.e. any byte >= 0x80).
91    */
92   private static final long ASCII_MASK_LONG = 0x8080808080808080L;
93 
94   /**
95    * Maximum number of bytes per Java UTF-16 char in UTF-8.
96    *
97    * @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
98    */
99   static final int MAX_BYTES_PER_CHAR = 3;
100 
101   /**
102    * State value indicating that the byte sequence is well-formed and complete (no further bytes are
103    * needed to complete a character).
104    */
105   public static final int COMPLETE = 0;
106 
107   /** State value indicating that the byte sequence is definitely not well-formed. */
108   public static final int MALFORMED = -1;
109 
110   /**
111    * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
112    * above which to employ an optimized algorithm for counting ASCII characters. The reason for this
113    * threshold is that for small strings, the optimization may not be beneficial or may even
114    * negatively impact performance since it requires additional logic to avoid unaligned reads (when
115    * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is
116    * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which
117    * provides a performance improvement that entirely subsumes the cost of the additional logic.
118    */
119   private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
120 
121   // Other state values include the partial bytes of the incomplete
122   // character to be decoded in the simplest way: we pack the bytes
123   // into the state int in little-endian order.  For example:
124   //
125   // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16);
126   //
127   // Such a state is unpacked thus (note the ~ operation for byte2 to
128   // undo byte1's sign-extension bits):
129   //
130   // int byte1 = (byte) state;
131   // int byte2 = (byte) ~(state >> 8);
132   // int byte3 = (byte) (state >> 16);
133   //
134   // We cannot store a zero byte in the state because it would be
135   // indistinguishable from the absence of a byte.  But we don't need
136   // to, because partial bytes must always be negative.  When building
137   // a state, we ensure that byte1 is negative and subsequent bytes
138   // are valid trailing bytes.
139 
140   /**
141    * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence.
142    *
143    * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0,
144    * bytes.length)}.
145    */
isValidUtf8(byte[] bytes)146   public static boolean isValidUtf8(byte[] bytes) {
147     return processor.isValidUtf8(bytes, 0, bytes.length);
148   }
149 
150   /**
151    * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
152    * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
153    * exclusive.
154    *
155    * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
156    * == Utf8.COMPLETE}.
157    */
isValidUtf8(byte[] bytes, int index, int limit)158   public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
159     return processor.isValidUtf8(bytes, index, limit);
160   }
161 
162   /**
163    * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
164    * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
165    * {@code limit}, exclusive.
166    *
167    * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
168    *     value returned from a call to a partial decoding method for the previous bytes
169    * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
170    *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
171    *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
172    *     "state" value containing enough information to decode the character when passed to a
173    *     subsequent invocation of a partial decoding method.
174    */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)175   public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
176     return processor.partialIsValidUtf8(state, bytes, index, limit);
177   }
178 
incompleteStateFor(int byte1)179   private static int incompleteStateFor(int byte1) {
180     return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
181   }
182 
incompleteStateFor(int byte1, int byte2)183   private static int incompleteStateFor(int byte1, int byte2) {
184     return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
185   }
186 
incompleteStateFor(int byte1, int byte2, int byte3)187   private static int incompleteStateFor(int byte1, int byte2, int byte3) {
188     return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF)
189         ? MALFORMED
190         : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
191   }
192 
incompleteStateFor(byte[] bytes, int index, int limit)193   private static int incompleteStateFor(byte[] bytes, int index, int limit) {
194     int byte1 = bytes[index - 1];
195     switch (limit - index) {
196       case 0:
197         return incompleteStateFor(byte1);
198       case 1:
199         return incompleteStateFor(byte1, bytes[index]);
200       case 2:
201         return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
202       default:
203         throw new AssertionError();
204     }
205   }
206 
incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)207   private static int incompleteStateFor(
208       final ByteBuffer buffer, final int byte1, final int index, final int remaining) {
209     switch (remaining) {
210       case 0:
211         return incompleteStateFor(byte1);
212       case 1:
213         return incompleteStateFor(byte1, buffer.get(index));
214       case 2:
215         return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1));
216       default:
217         throw new AssertionError();
218     }
219   }
220 
221   // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
222   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
223   // fallback to more lenient behavior.
224 
225   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)226     UnpairedSurrogateException(int index, int length) {
227       super("Unpaired surrogate at index " + index + " of " + length);
228     }
229   }
230 
231   /**
232    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
233    * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
234    * time and space.
235    *
236    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
237    *     surrogates)
238    */
encodedLength(CharSequence sequence)239   static int encodedLength(CharSequence sequence) {
240     // Warning to maintainers: this implementation is highly optimized.
241     int utf16Length = sequence.length();
242     int utf8Length = utf16Length;
243     int i = 0;
244 
245     // This loop optimizes for pure ASCII.
246     while (i < utf16Length && sequence.charAt(i) < 0x80) {
247       i++;
248     }
249 
250     // This loop optimizes for chars less than 0x800.
251     for (; i < utf16Length; i++) {
252       char c = sequence.charAt(i);
253       if (c < 0x800) {
254         utf8Length += ((0x7f - c) >>> 31); // branch free!
255       } else {
256         utf8Length += encodedLengthGeneral(sequence, i);
257         break;
258       }
259     }
260 
261     if (utf8Length < utf16Length) {
262       // Necessary and sufficient condition for overflow because of maximum 3x expansion
263       throw new IllegalArgumentException(
264           "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
265     }
266     return utf8Length;
267   }
268 
encodedLengthGeneral(CharSequence sequence, int start)269   private static int encodedLengthGeneral(CharSequence sequence, int start) {
270     int utf16Length = sequence.length();
271     int utf8Length = 0;
272     for (int i = start; i < utf16Length; i++) {
273       char c = sequence.charAt(i);
274       if (c < 0x800) {
275         utf8Length += (0x7f - c) >>> 31; // branch free!
276       } else {
277         utf8Length += 2;
278         // jdk7+: if (Character.isSurrogate(c)) {
279         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
280           // Check that we have a well-formed surrogate pair.
281           int cp = Character.codePointAt(sequence, i);
282           if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
283             throw new UnpairedSurrogateException(i, utf16Length);
284           }
285           i++;
286         }
287       }
288     }
289     return utf8Length;
290   }
291 
encode(CharSequence in, byte[] out, int offset, int length)292   static int encode(CharSequence in, byte[] out, int offset, int length) {
293     return processor.encodeUtf8(in, out, offset, length);
294   }
295   // End Guava UTF-8 methods.
296 
297   /**
298    * Determines if the given {@link ByteBuffer} is a valid UTF-8 string.
299    *
300    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
301    * and the capabilities of the platform.
302    *
303    * @param buffer the buffer to check.
304    * @see Utf8#isValidUtf8(byte[], int, int)
305    */
isValidUtf8(ByteBuffer buffer)306   static boolean isValidUtf8(ByteBuffer buffer) {
307     return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
308   }
309 
310   /**
311    * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
312    *
313    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
314    * and the capabilities of the platform.
315    *
316    * @param buffer the buffer to check.
317    * @see Utf8#partialIsValidUtf8(int, byte[], int, int)
318    */
partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)319   static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
320     return processor.partialIsValidUtf8(state, buffer, index, limit);
321   }
322 
323   /**
324    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
325    *
326    * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
327    */
decodeUtf8(ByteBuffer buffer, int index, int size)328   static String decodeUtf8(ByteBuffer buffer, int index, int size)
329       throws InvalidProtocolBufferException {
330     return processor.decodeUtf8(buffer, index, size);
331   }
332 
333   /**
334    * Decodes the given UTF-8 encoded byte array slice into a {@link String}.
335    *
336    * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
337    */
decodeUtf8(byte[] bytes, int index, int size)338   static String decodeUtf8(byte[] bytes, int index, int size)
339       throws InvalidProtocolBufferException {
340     return processor.decodeUtf8(bytes, index, size);
341   }
342 
343   /**
344    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
345    *
346    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
347    * and the capabilities of the platform.
348    *
349    * @param in the source string to be encoded
350    * @param out the target buffer to receive the encoded string.
351    * @see Utf8#encode(CharSequence, byte[], int, int)
352    */
encodeUtf8(CharSequence in, ByteBuffer out)353   static void encodeUtf8(CharSequence in, ByteBuffer out) {
354     processor.encodeUtf8(in, out);
355   }
356 
357   /**
358    * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte
359    * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte
360    * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
361    *
362    * @param buffer the buffer to be scanned for ASCII chars
363    * @param index the starting index of the scan
364    * @param limit the limit within buffer for the scan
365    * @return the number of ASCII characters found. The stopping position will be at or before the
366    *     first non-ASCII byte.
367    */
estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)368   private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
369     int i = index;
370     final int lim = limit - 7;
371     // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
372     // To speed things up further, we're reading longs instead of bytes so we use a mask to
373     // determine if any byte in the current long is non-ASCII.
374     for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {}
375     return i - index;
376   }
377 
378   /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */
379   // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
380   abstract static class Processor {
381     /**
382      * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
383      * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
384      * exclusive.
385      *
386      * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
387      * == Utf8.COMPLETE}.
388      */
isValidUtf8(byte[] bytes, int index, int limit)389     final boolean isValidUtf8(byte[] bytes, int index, int limit) {
390       return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
391     }
392 
393     /**
394      * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8
395      * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
396      * to {@code limit}, exclusive.
397      *
398      * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
399      *     value returned from a call to a partial decoding method for the previous bytes
400      * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
401      *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
402      *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
403      *     "state" value containing enough information to decode the character when passed to a
404      *     subsequent invocation of a partial decoding method.
405      */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)406     abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
407 
408     /**
409      * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
410      * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
411      * to {@code limit}, exclusive.
412      *
413      * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
414      * == Utf8.COMPLETE}.
415      */
isValidUtf8(ByteBuffer buffer, int index, int limit)416     final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
417       return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
418     }
419 
420     /**
421      * Indicates whether or not the given buffer contains a valid UTF-8 string.
422      *
423      * @param buffer the buffer to check.
424      * @return {@code true} if the given buffer contains a valid UTF-8 string.
425      */
partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)426     final int partialIsValidUtf8(
427         final int state, final ByteBuffer buffer, int index, final int limit) {
428       if (buffer.hasArray()) {
429         final int offset = buffer.arrayOffset();
430         return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
431       } else if (buffer.isDirect()) {
432         return partialIsValidUtf8Direct(state, buffer, index, limit);
433       }
434       return partialIsValidUtf8Default(state, buffer, index, limit);
435     }
436 
437     /** Performs validation for direct {@link ByteBuffer} instances. */
partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)438     abstract int partialIsValidUtf8Direct(
439         final int state, final ByteBuffer buffer, int index, final int limit);
440 
441     /**
442      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
443      * than potentially faster approaches. This first completes validation for the current character
444      * (provided by {@code state}) and then finishes validation for the sequence.
445      */
partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)446     final int partialIsValidUtf8Default(
447         final int state, final ByteBuffer buffer, int index, final int limit) {
448       if (state != COMPLETE) {
449         // The previous decoding operation was incomplete (or malformed).
450         // We look for a well-formed sequence consisting of bytes from
451         // the previous decoding operation (stored in state) together
452         // with bytes from the array slice.
453         //
454         // We expect such "straddler characters" to be rare.
455 
456         if (index >= limit) { // No bytes? No progress.
457           return state;
458         }
459 
460         byte byte1 = (byte) state;
461         // byte1 is never ASCII.
462         if (byte1 < (byte) 0xE0) {
463           // two-byte form
464 
465           // Simultaneously checks for illegal trailing-byte in
466           // leading position and overlong 2-byte form.
467           if (byte1 < (byte) 0xC2
468               // byte2 trailing-byte test
469               || buffer.get(index++) > (byte) 0xBF) {
470             return MALFORMED;
471           }
472         } else if (byte1 < (byte) 0xF0) {
473           // three-byte form
474 
475           // Get byte2 from saved state or array
476           byte byte2 = (byte) ~(state >> 8);
477           if (byte2 == 0) {
478             byte2 = buffer.get(index++);
479             if (index >= limit) {
480               return incompleteStateFor(byte1, byte2);
481             }
482           }
483           if (byte2 > (byte) 0xBF
484               // overlong? 5 most significant bits must not all be zero
485               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
486               // illegal surrogate codepoint?
487               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
488               // byte3 trailing-byte test
489               || buffer.get(index++) > (byte) 0xBF) {
490             return MALFORMED;
491           }
492         } else {
493           // four-byte form
494 
495           // Get byte2 and byte3 from saved state or array
496           byte byte2 = (byte) ~(state >> 8);
497           byte byte3 = 0;
498           if (byte2 == 0) {
499             byte2 = buffer.get(index++);
500             if (index >= limit) {
501               return incompleteStateFor(byte1, byte2);
502             }
503           } else {
504             byte3 = (byte) (state >> 16);
505           }
506           if (byte3 == 0) {
507             byte3 = buffer.get(index++);
508             if (index >= limit) {
509               return incompleteStateFor(byte1, byte2, byte3);
510             }
511           }
512 
513           // If we were called with state == MALFORMED, then byte1 is 0xFF,
514           // which never occurs in well-formed UTF-8, and so we will return
515           // MALFORMED again below.
516 
517           if (byte2 > (byte) 0xBF
518               // Check that 1 <= plane <= 16.  Tricky optimized form of:
519               // if (byte1 > (byte) 0xF4 ||
520               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
521               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
522               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
523               // byte3 trailing-byte test
524               || byte3 > (byte) 0xBF
525               // byte4 trailing-byte test
526               || buffer.get(index++) > (byte) 0xBF) {
527             return MALFORMED;
528           }
529         }
530       }
531 
532       // Finish validation for the sequence.
533       return partialIsValidUtf8(buffer, index, limit);
534     }
535 
536     /**
537      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
538      * than potentially faster approaches.
539      */
partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)540     private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
541       index += estimateConsecutiveAscii(buffer, index, limit);
542 
543       for (; ; ) {
544         // Optimize for interior runs of ASCII bytes.
545         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
546         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
547         int byte1;
548         do {
549           if (index >= limit) {
550             return COMPLETE;
551           }
552         } while ((byte1 = buffer.get(index++)) >= 0);
553 
554         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
555         if (byte1 < (byte) 0xE0) {
556           // Two-byte form (110xxxxx 10xxxxxx)
557           if (index >= limit) {
558             // Incomplete sequence
559             return byte1;
560           }
561 
562           // Simultaneously checks for illegal trailing-byte in
563           // leading position and overlong 2-byte form.
564           if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) {
565             return MALFORMED;
566           }
567           index++;
568         } else if (byte1 < (byte) 0xF0) {
569           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
570           if (index >= limit - 1) {
571             // Incomplete sequence
572             return incompleteStateFor(buffer, byte1, index, limit - index);
573           }
574 
575           final byte byte2 = buffer.get(index++);
576           if (byte2 > (byte) 0xBF
577               // overlong? 5 most significant bits must not all be zero
578               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
579               // check for illegal surrogate codepoints
580               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
581               // byte3 trailing-byte test
582               || buffer.get(index) > (byte) 0xBF) {
583             return MALFORMED;
584           }
585           index++;
586         } else {
587           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
588           if (index >= limit - 2) {
589             // Incomplete sequence
590             return incompleteStateFor(buffer, byte1, index, limit - index);
591           }
592 
593           // TODO(nathanmittler): Consider using getInt() to improve performance.
594           final int byte2 = buffer.get(index++);
595           if (byte2 > (byte) 0xBF
596               // Check that 1 <= plane <= 16.  Tricky optimized form of:
597               // if (byte1 > (byte) 0xF4 ||
598               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
599               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
600               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
601               // byte3 trailing-byte test
602               || buffer.get(index++) > (byte) 0xBF
603               // byte4 trailing-byte test
604               || buffer.get(index++) > (byte) 0xBF) {
605             return MALFORMED;
606           }
607         }
608       }
609     }
610 
611     /**
612      * Decodes the given byte array slice into a {@link String}.
613      *
614      * @throws InvalidProtocolBufferException if the byte array slice is not valid UTF-8.
615      */
decodeUtf8(byte[] bytes, int index, int size)616     abstract String decodeUtf8(byte[] bytes, int index, int size)
617         throws InvalidProtocolBufferException;
618 
619     /**
620      * Decodes the given portion of the {@link ByteBuffer} into a {@link String}.
621      *
622      * @throws InvalidProtocolBufferException if the portion of the buffer is not valid UTF-8.
623      */
decodeUtf8(ByteBuffer buffer, int index, int size)624     final String decodeUtf8(ByteBuffer buffer, int index, int size)
625         throws InvalidProtocolBufferException {
626       if (buffer.hasArray()) {
627         final int offset = buffer.arrayOffset();
628         return decodeUtf8(buffer.array(), offset + index, size);
629       } else if (buffer.isDirect()) {
630         return decodeUtf8Direct(buffer, index, size);
631       }
632       return decodeUtf8Default(buffer, index, size);
633     }
634 
635     /** Decodes direct {@link ByteBuffer} instances into {@link String}. */
decodeUtf8Direct(ByteBuffer buffer, int index, int size)636     abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
637         throws InvalidProtocolBufferException;
638 
639     /**
640      * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially
641      * faster approaches.
642      */
decodeUtf8Default(ByteBuffer buffer, int index, int size)643     final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
644         throws InvalidProtocolBufferException {
645       // Bitwise OR combines the sign bits so any negative value fails the check.
646       if ((index | size | buffer.limit() - index - size) < 0) {
647         throw new ArrayIndexOutOfBoundsException(
648             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
649       }
650 
651       int offset = index;
652       final int limit = offset + size;
653 
654       // The longest possible resulting String is the same as the number of input bytes, when it is
655       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
656       char[] resultArr = new char[size];
657       int resultPos = 0;
658 
659       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
660       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
661       while (offset < limit) {
662         byte b = buffer.get(offset);
663         if (!DecodeUtil.isOneByte(b)) {
664           break;
665         }
666         offset++;
667         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
668       }
669 
670       while (offset < limit) {
671         byte byte1 = buffer.get(offset++);
672         if (DecodeUtil.isOneByte(byte1)) {
673           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
674           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
675           // extra optimized loop to take care of these runs.
676           while (offset < limit) {
677             byte b = buffer.get(offset);
678             if (!DecodeUtil.isOneByte(b)) {
679               break;
680             }
681             offset++;
682             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
683           }
684         } else if (DecodeUtil.isTwoBytes(byte1)) {
685           if (offset >= limit) {
686             throw InvalidProtocolBufferException.invalidUtf8();
687           }
688           DecodeUtil.handleTwoBytes(
689               byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++);
690         } else if (DecodeUtil.isThreeBytes(byte1)) {
691           if (offset >= limit - 1) {
692             throw InvalidProtocolBufferException.invalidUtf8();
693           }
694           DecodeUtil.handleThreeBytes(
695               byte1,
696               /* byte2 */ buffer.get(offset++),
697               /* byte3 */ buffer.get(offset++),
698               resultArr,
699               resultPos++);
700         } else {
701           if (offset >= limit - 2) {
702             throw InvalidProtocolBufferException.invalidUtf8();
703           }
704           DecodeUtil.handleFourBytes(
705               byte1,
706               /* byte2 */ buffer.get(offset++),
707               /* byte3 */ buffer.get(offset++),
708               /* byte4 */ buffer.get(offset++),
709               resultArr,
710               resultPos++);
711           // 4-byte case requires two chars.
712           resultPos++;
713         }
714       }
715 
716       return new String(resultArr, 0, resultPos);
717     }
718 
719     /**
720      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
721      * For a string, this method is similar to
722      *
723      * <pre>{@code
724      * byte[] a = string.getBytes(UTF_8);
725      * System.arraycopy(a, 0, bytes, offset, a.length);
726      * return offset + a.length;
727      * }</pre>
728      *
729      * but is more efficient in both time and space. One key difference is that this method requires
730      * paired surrogates, and therefore does not support chunking. While {@code
731      * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character,
732      * this method throws {@link UnpairedSurrogateException}.
733      *
734      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
735      * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
736      * sequence.length()}, which is the largest possible number of bytes that any input can be
737      * encoded to.
738      *
739      * @param in the input character sequence to be encoded
740      * @param out the target array
741      * @param offset the starting offset in {@code bytes} to start writing at
742      * @param length the length of the {@code bytes}, starting from {@code offset}
743      * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired
744      *     surrogates)
745      * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than
746      *     {@code bytes.length - offset}
747      * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)}
748      */
encodeUtf8(CharSequence in, byte[] out, int offset, int length)749     abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length);
750 
751     /**
752      * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
753      * Upon returning from this method, the {@code out} position will point to the position after
754      * the last encoded byte. This method requires paired surrogates, and therefore does not support
755      * chunking.
756      *
757      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
758      * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
759      * in.length()}, which is the largest possible number of bytes that any input can be encoded to.
760      *
761      * @param in the source character sequence to be encoded
762      * @param out the target buffer
763      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
764      *     surrogates)
765      * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code
766      *     out.remaining()}
767      */
encodeUtf8(CharSequence in, ByteBuffer out)768     final void encodeUtf8(CharSequence in, ByteBuffer out) {
769       if (out.hasArray()) {
770         final int offset = out.arrayOffset();
771         int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
772         out.position(endIndex - offset);
773       } else if (out.isDirect()) {
774         encodeUtf8Direct(in, out);
775       } else {
776         encodeUtf8Default(in, out);
777       }
778     }
779 
780     /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */
encodeUtf8Direct(CharSequence in, ByteBuffer out)781     abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
782 
783     /**
784      * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link
785      * ByteBuffer} API, rather than potentially faster approaches.
786      */
encodeUtf8Default(CharSequence in, ByteBuffer out)787     final void encodeUtf8Default(CharSequence in, ByteBuffer out) {
788       final int inLength = in.length();
789       int outIx = out.position();
790       int inIx = 0;
791 
792       // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
793       // access. Assume the buffer is big enough and let it handle the out of bounds exception
794       // if it occurs.
795       try {
796         // Designed to take advantage of
797         // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
798         for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
799           out.put(outIx + inIx, (byte) c);
800         }
801         if (inIx == inLength) {
802           // Successfully encoded the entire string.
803           out.position(outIx + inIx);
804           return;
805         }
806 
807         outIx += inIx;
808         for (char c; inIx < inLength; ++inIx, ++outIx) {
809           c = in.charAt(inIx);
810           if (c < 0x80) {
811             // One byte (0xxx xxxx)
812             out.put(outIx, (byte) c);
813           } else if (c < 0x800) {
814             // Two bytes (110x xxxx 10xx xxxx)
815 
816             // Benchmarks show put performs better than putShort here (for HotSpot).
817             out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
818             out.put(outIx, (byte) (0x80 | (0x3F & c)));
819           } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
820             // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
821             // Maximum single-char code point is 0xFFFF, 16 bits.
822 
823             // Benchmarks show put performs better than putShort here (for HotSpot).
824             out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
825             out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
826             out.put(outIx, (byte) (0x80 | (0x3F & c)));
827           } else {
828             // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
829 
830             // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
831             // bytes
832             final char low;
833             if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
834               throw new UnpairedSurrogateException(inIx, inLength);
835             }
836             // TODO(nathanmittler): Consider using putInt() to improve performance.
837             int codePoint = toCodePoint(c, low);
838             out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
839             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
840             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
841             out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
842           }
843         }
844 
845         // Successfully encoded the entire string.
846         out.position(outIx);
847       } catch (IndexOutOfBoundsException e) {
848         // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead.
849 
850         // If we failed in the outer ASCII loop, outIx will not have been updated. In this case,
851         // use inIx to determine the bad write index.
852         int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
853         throw new ArrayIndexOutOfBoundsException(
854             "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex);
855       }
856     }
857   }
858 
859   /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
860   static final class SafeProcessor extends Processor {
861     @Override
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)862     int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
863       if (state != COMPLETE) {
864         // The previous decoding operation was incomplete (or malformed).
865         // We look for a well-formed sequence consisting of bytes from
866         // the previous decoding operation (stored in state) together
867         // with bytes from the array slice.
868         //
869         // We expect such "straddler characters" to be rare.
870 
871         if (index >= limit) { // No bytes? No progress.
872           return state;
873         }
874         int byte1 = (byte) state;
875         // byte1 is never ASCII.
876         if (byte1 < (byte) 0xE0) {
877           // two-byte form
878 
879           // Simultaneously checks for illegal trailing-byte in
880           // leading position and overlong 2-byte form.
881           if (byte1 < (byte) 0xC2
882               // byte2 trailing-byte test
883               || bytes[index++] > (byte) 0xBF) {
884             return MALFORMED;
885           }
886         } else if (byte1 < (byte) 0xF0) {
887           // three-byte form
888 
889           // Get byte2 from saved state or array
890           int byte2 = (byte) ~(state >> 8);
891           if (byte2 == 0) {
892             byte2 = bytes[index++];
893             if (index >= limit) {
894               return incompleteStateFor(byte1, byte2);
895             }
896           }
897           if (byte2 > (byte) 0xBF
898               // overlong? 5 most significant bits must not all be zero
899               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
900               // illegal surrogate codepoint?
901               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
902               // byte3 trailing-byte test
903               || bytes[index++] > (byte) 0xBF) {
904             return MALFORMED;
905           }
906         } else {
907           // four-byte form
908 
909           // Get byte2 and byte3 from saved state or array
910           int byte2 = (byte) ~(state >> 8);
911           int byte3 = 0;
912           if (byte2 == 0) {
913             byte2 = bytes[index++];
914             if (index >= limit) {
915               return incompleteStateFor(byte1, byte2);
916             }
917           } else {
918             byte3 = (byte) (state >> 16);
919           }
920           if (byte3 == 0) {
921             byte3 = bytes[index++];
922             if (index >= limit) {
923               return incompleteStateFor(byte1, byte2, byte3);
924             }
925           }
926 
927           // If we were called with state == MALFORMED, then byte1 is 0xFF,
928           // which never occurs in well-formed UTF-8, and so we will return
929           // MALFORMED again below.
930 
931           if (byte2 > (byte) 0xBF
932               // Check that 1 <= plane <= 16.  Tricky optimized form of:
933               // if (byte1 > (byte) 0xF4 ||
934               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
935               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
936               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
937               // byte3 trailing-byte test
938               || byte3 > (byte) 0xBF
939               // byte4 trailing-byte test
940               || bytes[index++] > (byte) 0xBF) {
941             return MALFORMED;
942           }
943         }
944       }
945 
946       return partialIsValidUtf8(bytes, index, limit);
947     }
948 
949     @Override
partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)950     int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
951       // For safe processing, we have to use the ByteBuffer API.
952       return partialIsValidUtf8Default(state, buffer, index, limit);
953     }
954 
955     @Override
decodeUtf8(byte[] bytes, int index, int size)956     String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
957       // Bitwise OR combines the sign bits so any negative value fails the check.
958       if ((index | size | bytes.length - index - size) < 0) {
959         throw new ArrayIndexOutOfBoundsException(
960             String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
961       }
962 
963       int offset = index;
964       final int limit = offset + size;
965 
966       // The longest possible resulting String is the same as the number of input bytes, when it is
967       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
968       char[] resultArr = new char[size];
969       int resultPos = 0;
970 
971       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
972       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
973       while (offset < limit) {
974         byte b = bytes[offset];
975         if (!DecodeUtil.isOneByte(b)) {
976           break;
977         }
978         offset++;
979         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
980       }
981 
982       while (offset < limit) {
983         byte byte1 = bytes[offset++];
984         if (DecodeUtil.isOneByte(byte1)) {
985           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
986           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
987           // extra optimized loop to take care of these runs.
988           while (offset < limit) {
989             byte b = bytes[offset];
990             if (!DecodeUtil.isOneByte(b)) {
991               break;
992             }
993             offset++;
994             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
995           }
996         } else if (DecodeUtil.isTwoBytes(byte1)) {
997           if (offset >= limit) {
998             throw InvalidProtocolBufferException.invalidUtf8();
999           }
1000           DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++);
1001         } else if (DecodeUtil.isThreeBytes(byte1)) {
1002           if (offset >= limit - 1) {
1003             throw InvalidProtocolBufferException.invalidUtf8();
1004           }
1005           DecodeUtil.handleThreeBytes(
1006               byte1,
1007               /* byte2 */ bytes[offset++],
1008               /* byte3 */ bytes[offset++],
1009               resultArr,
1010               resultPos++);
1011         } else {
1012           if (offset >= limit - 2) {
1013             throw InvalidProtocolBufferException.invalidUtf8();
1014           }
1015           DecodeUtil.handleFourBytes(
1016               byte1,
1017               /* byte2 */ bytes[offset++],
1018               /* byte3 */ bytes[offset++],
1019               /* byte4 */ bytes[offset++],
1020               resultArr,
1021               resultPos++);
1022           // 4-byte case requires two chars.
1023           resultPos++;
1024         }
1025       }
1026 
1027       return new String(resultArr, 0, resultPos);
1028     }
1029 
1030     @Override
decodeUtf8Direct(ByteBuffer buffer, int index, int size)1031     String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
1032         throws InvalidProtocolBufferException {
1033       // For safe processing, we have to use the ByteBufferAPI.
1034       return decodeUtf8Default(buffer, index, size);
1035     }
1036 
1037     @Override
encodeUtf8(CharSequence in, byte[] out, int offset, int length)1038     int encodeUtf8(CharSequence in, byte[] out, int offset, int length) {
1039       int utf16Length = in.length();
1040       int j = offset;
1041       int i = 0;
1042       int limit = offset + length;
1043       // Designed to take advantage of
1044       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1045       for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) {
1046         out[j + i] = (byte) c;
1047       }
1048       if (i == utf16Length) {
1049         return j + utf16Length;
1050       }
1051       j += i;
1052       for (char c; i < utf16Length; i++) {
1053         c = in.charAt(i);
1054         if (c < 0x80 && j < limit) {
1055           out[j++] = (byte) c;
1056         } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
1057           out[j++] = (byte) ((0xF << 6) | (c >>> 6));
1058           out[j++] = (byte) (0x80 | (0x3F & c));
1059         } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) {
1060           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1061           out[j++] = (byte) ((0xF << 5) | (c >>> 12));
1062           out[j++] = (byte) (0x80 | (0x3F & (c >>> 6)));
1063           out[j++] = (byte) (0x80 | (0x3F & c));
1064         } else if (j <= limit - 4) {
1065           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
1066           // four UTF-8 bytes
1067           final char low;
1068           if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
1069             throw new UnpairedSurrogateException((i - 1), utf16Length);
1070           }
1071           int codePoint = Character.toCodePoint(c, low);
1072           out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18));
1073           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
1074           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
1075           out[j++] = (byte) (0x80 | (0x3F & codePoint));
1076         } else {
1077           // If we are surrogates and we're not a surrogate pair, always throw an
1078           // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
1079           if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
1080               && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
1081             throw new UnpairedSurrogateException(i, utf16Length);
1082           }
1083           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
1084         }
1085       }
1086       return j;
1087     }
1088 
1089     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)1090     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1091       // For safe processing, we have to use the ByteBuffer API.
1092       encodeUtf8Default(in, out);
1093     }
1094 
partialIsValidUtf8(byte[] bytes, int index, int limit)1095     private static int partialIsValidUtf8(byte[] bytes, int index, int limit) {
1096       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
1097       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1098       while (index < limit && bytes[index] >= 0) {
1099         index++;
1100       }
1101 
1102       return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit);
1103     }
1104 
partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)1105     private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
1106       for (; ; ) {
1107         int byte1;
1108         int byte2;
1109 
1110         // Optimize for interior runs of ASCII bytes.
1111         do {
1112           if (index >= limit) {
1113             return COMPLETE;
1114           }
1115         } while ((byte1 = bytes[index++]) >= 0);
1116 
1117         if (byte1 < (byte) 0xE0) {
1118           // two-byte form
1119 
1120           if (index >= limit) {
1121             // Incomplete sequence
1122             return byte1;
1123           }
1124 
1125           // Simultaneously checks for illegal trailing-byte in
1126           // leading position and overlong 2-byte form.
1127           if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
1128             return MALFORMED;
1129           }
1130         } else if (byte1 < (byte) 0xF0) {
1131           // three-byte form
1132 
1133           if (index >= limit - 1) { // incomplete sequence
1134             return incompleteStateFor(bytes, index, limit);
1135           }
1136           if ((byte2 = bytes[index++]) > (byte) 0xBF
1137               // overlong? 5 most significant bits must not all be zero
1138               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1139               // check for illegal surrogate codepoints
1140               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1141               // byte3 trailing-byte test
1142               || bytes[index++] > (byte) 0xBF) {
1143             return MALFORMED;
1144           }
1145         } else {
1146           // four-byte form
1147 
1148           if (index >= limit - 2) { // incomplete sequence
1149             return incompleteStateFor(bytes, index, limit);
1150           }
1151           if ((byte2 = bytes[index++]) > (byte) 0xBF
1152               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1153               // if (byte1 > (byte) 0xF4 ||
1154               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1155               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1156               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1157               // byte3 trailing-byte test
1158               || bytes[index++] > (byte) 0xBF
1159               // byte4 trailing-byte test
1160               || bytes[index++] > (byte) 0xBF) {
1161             return MALFORMED;
1162           }
1163         }
1164       }
1165     }
1166   }
1167 
1168   /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */
1169   static final class UnsafeProcessor extends Processor {
1170     /** Indicates whether or not all required unsafe operations are supported on this platform. */
isAvailable()1171     static boolean isAvailable() {
1172       return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
1173     }
1174 
1175     @Override
partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)1176     int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
1177       // Bitwise OR combines the sign bits so any negative value fails the check.
1178       if ((index | limit | bytes.length - limit) < 0) {
1179         throw new ArrayIndexOutOfBoundsException(
1180             String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit));
1181       }
1182       long offset = index;
1183       final long offsetLimit = limit;
1184       if (state != COMPLETE) {
1185         // The previous decoding operation was incomplete (or malformed).
1186         // We look for a well-formed sequence consisting of bytes from
1187         // the previous decoding operation (stored in state) together
1188         // with bytes from the array slice.
1189         //
1190         // We expect such "straddler characters" to be rare.
1191 
1192         if (offset >= offsetLimit) { // No bytes? No progress.
1193           return state;
1194         }
1195         int byte1 = (byte) state;
1196         // byte1 is never ASCII.
1197         if (byte1 < (byte) 0xE0) {
1198           // two-byte form
1199 
1200           // Simultaneously checks for illegal trailing-byte in
1201           // leading position and overlong 2-byte form.
1202           if (byte1 < (byte) 0xC2
1203               // byte2 trailing-byte test
1204               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1205             return MALFORMED;
1206           }
1207         } else if (byte1 < (byte) 0xF0) {
1208           // three-byte form
1209 
1210           // Get byte2 from saved state or array
1211           int byte2 = (byte) ~(state >> 8);
1212           if (byte2 == 0) {
1213             byte2 = UnsafeUtil.getByte(bytes, offset++);
1214             if (offset >= offsetLimit) {
1215               return incompleteStateFor(byte1, byte2);
1216             }
1217           }
1218           if (byte2 > (byte) 0xBF
1219               // overlong? 5 most significant bits must not all be zero
1220               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1221               // illegal surrogate codepoint?
1222               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1223               // byte3 trailing-byte test
1224               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1225             return MALFORMED;
1226           }
1227         } else {
1228           // four-byte form
1229 
1230           // Get byte2 and byte3 from saved state or array
1231           int byte2 = (byte) ~(state >> 8);
1232           int byte3 = 0;
1233           if (byte2 == 0) {
1234             byte2 = UnsafeUtil.getByte(bytes, offset++);
1235             if (offset >= offsetLimit) {
1236               return incompleteStateFor(byte1, byte2);
1237             }
1238           } else {
1239             byte3 = (byte) (state >> 16);
1240           }
1241           if (byte3 == 0) {
1242             byte3 = UnsafeUtil.getByte(bytes, offset++);
1243             if (offset >= offsetLimit) {
1244               return incompleteStateFor(byte1, byte2, byte3);
1245             }
1246           }
1247 
1248           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1249           // which never occurs in well-formed UTF-8, and so we will return
1250           // MALFORMED again below.
1251 
1252           if (byte2 > (byte) 0xBF
1253               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1254               // if (byte1 > (byte) 0xF4 ||
1255               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1256               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1257               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1258               // byte3 trailing-byte test
1259               || byte3 > (byte) 0xBF
1260               // byte4 trailing-byte test
1261               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1262             return MALFORMED;
1263           }
1264         }
1265       }
1266 
1267       return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
1268     }
1269 
1270     @Override
partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1271     int partialIsValidUtf8Direct(
1272         final int state, ByteBuffer buffer, final int index, final int limit) {
1273       // Bitwise OR combines the sign bits so any negative value fails the check.
1274       if ((index | limit | buffer.limit() - limit) < 0) {
1275         throw new ArrayIndexOutOfBoundsException(
1276             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit));
1277       }
1278       long address = addressOffset(buffer) + index;
1279       final long addressLimit = address + (limit - index);
1280       if (state != COMPLETE) {
1281         // The previous decoding operation was incomplete (or malformed).
1282         // We look for a well-formed sequence consisting of bytes from
1283         // the previous decoding operation (stored in state) together
1284         // with bytes from the array slice.
1285         //
1286         // We expect such "straddler characters" to be rare.
1287 
1288         if (address >= addressLimit) { // No bytes? No progress.
1289           return state;
1290         }
1291 
1292         final int byte1 = (byte) state;
1293         // byte1 is never ASCII.
1294         if (byte1 < (byte) 0xE0) {
1295           // two-byte form
1296 
1297           // Simultaneously checks for illegal trailing-byte in
1298           // leading position and overlong 2-byte form.
1299           if (byte1 < (byte) 0xC2
1300               // byte2 trailing-byte test
1301               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1302             return MALFORMED;
1303           }
1304         } else if (byte1 < (byte) 0xF0) {
1305           // three-byte form
1306 
1307           // Get byte2 from saved state or array
1308           int byte2 = (byte) ~(state >> 8);
1309           if (byte2 == 0) {
1310             byte2 = UnsafeUtil.getByte(address++);
1311             if (address >= addressLimit) {
1312               return incompleteStateFor(byte1, byte2);
1313             }
1314           }
1315           if (byte2 > (byte) 0xBF
1316               // overlong? 5 most significant bits must not all be zero
1317               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1318               // illegal surrogate codepoint?
1319               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1320               // byte3 trailing-byte test
1321               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1322             return MALFORMED;
1323           }
1324         } else {
1325           // four-byte form
1326 
1327           // Get byte2 and byte3 from saved state or array
1328           int byte2 = (byte) ~(state >> 8);
1329           int byte3 = 0;
1330           if (byte2 == 0) {
1331             byte2 = UnsafeUtil.getByte(address++);
1332             if (address >= addressLimit) {
1333               return incompleteStateFor(byte1, byte2);
1334             }
1335           } else {
1336             byte3 = (byte) (state >> 16);
1337           }
1338           if (byte3 == 0) {
1339             byte3 = UnsafeUtil.getByte(address++);
1340             if (address >= addressLimit) {
1341               return incompleteStateFor(byte1, byte2, byte3);
1342             }
1343           }
1344 
1345           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1346           // which never occurs in well-formed UTF-8, and so we will return
1347           // MALFORMED again below.
1348 
1349           if (byte2 > (byte) 0xBF
1350               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1351               // if (byte1 > (byte) 0xF4 ||
1352               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1353               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1354               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1355               // byte3 trailing-byte test
1356               || byte3 > (byte) 0xBF
1357               // byte4 trailing-byte test
1358               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1359             return MALFORMED;
1360           }
1361         }
1362       }
1363 
1364       return partialIsValidUtf8(address, (int) (addressLimit - address));
1365     }
1366 
1367     @Override
decodeUtf8(byte[] bytes, int index, int size)1368     String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
1369       if ((index | size | bytes.length - index - size) < 0) {
1370         throw new ArrayIndexOutOfBoundsException(
1371             String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
1372       }
1373 
1374       int offset = index;
1375       final int limit = offset + size;
1376 
1377       // The longest possible resulting String is the same as the number of input bytes, when it is
1378       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
1379       char[] resultArr = new char[size];
1380       int resultPos = 0;
1381 
1382       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
1383       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1384       while (offset < limit) {
1385         byte b = UnsafeUtil.getByte(bytes, offset);
1386         if (!DecodeUtil.isOneByte(b)) {
1387           break;
1388         }
1389         offset++;
1390         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1391       }
1392 
1393       while (offset < limit) {
1394         byte byte1 = UnsafeUtil.getByte(bytes, offset++);
1395         if (DecodeUtil.isOneByte(byte1)) {
1396           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
1397           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
1398           // extra optimized loop to take care of these runs.
1399           while (offset < limit) {
1400             byte b = UnsafeUtil.getByte(bytes, offset);
1401             if (!DecodeUtil.isOneByte(b)) {
1402               break;
1403             }
1404             offset++;
1405             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1406           }
1407         } else if (DecodeUtil.isTwoBytes(byte1)) {
1408           if (offset >= limit) {
1409             throw InvalidProtocolBufferException.invalidUtf8();
1410           }
1411           DecodeUtil.handleTwoBytes(
1412               byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
1413         } else if (DecodeUtil.isThreeBytes(byte1)) {
1414           if (offset >= limit - 1) {
1415             throw InvalidProtocolBufferException.invalidUtf8();
1416           }
1417           DecodeUtil.handleThreeBytes(
1418               byte1,
1419               /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
1420               /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
1421               resultArr,
1422               resultPos++);
1423         } else {
1424           if (offset >= limit - 2) {
1425             throw InvalidProtocolBufferException.invalidUtf8();
1426           }
1427           DecodeUtil.handleFourBytes(
1428               byte1,
1429               /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
1430               /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
1431               /* byte4 */ UnsafeUtil.getByte(bytes, offset++),
1432               resultArr,
1433               resultPos++);
1434           // 4-byte case requires two chars.
1435           resultPos++;
1436         }
1437       }
1438 
1439       return new String(resultArr, 0, resultPos);
1440     }
1441 
1442     @Override
decodeUtf8Direct(ByteBuffer buffer, int index, int size)1443     String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
1444         throws InvalidProtocolBufferException {
1445       // Bitwise OR combines the sign bits so any negative value fails the check.
1446       if ((index | size | buffer.limit() - index - size) < 0) {
1447         throw new ArrayIndexOutOfBoundsException(
1448             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
1449       }
1450       long address = UnsafeUtil.addressOffset(buffer) + index;
1451       final long addressLimit = address + size;
1452 
1453       // The longest possible resulting String is the same as the number of input bytes, when it is
1454       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
1455       char[] resultArr = new char[size];
1456       int resultPos = 0;
1457 
1458       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
1459       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1460       while (address < addressLimit) {
1461         byte b = UnsafeUtil.getByte(address);
1462         if (!DecodeUtil.isOneByte(b)) {
1463           break;
1464         }
1465         address++;
1466         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1467       }
1468 
1469       while (address < addressLimit) {
1470         byte byte1 = UnsafeUtil.getByte(address++);
1471         if (DecodeUtil.isOneByte(byte1)) {
1472           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
1473           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
1474           // extra optimized loop to take care of these runs.
1475           while (address < addressLimit) {
1476             byte b = UnsafeUtil.getByte(address);
1477             if (!DecodeUtil.isOneByte(b)) {
1478               break;
1479             }
1480             address++;
1481             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1482           }
1483         } else if (DecodeUtil.isTwoBytes(byte1)) {
1484           if (address >= addressLimit) {
1485             throw InvalidProtocolBufferException.invalidUtf8();
1486           }
1487           DecodeUtil.handleTwoBytes(
1488               byte1, /* byte2 */ UnsafeUtil.getByte(address++), resultArr, resultPos++);
1489         } else if (DecodeUtil.isThreeBytes(byte1)) {
1490           if (address >= addressLimit - 1) {
1491             throw InvalidProtocolBufferException.invalidUtf8();
1492           }
1493           DecodeUtil.handleThreeBytes(
1494               byte1,
1495               /* byte2 */ UnsafeUtil.getByte(address++),
1496               /* byte3 */ UnsafeUtil.getByte(address++),
1497               resultArr,
1498               resultPos++);
1499         } else {
1500           if (address >= addressLimit - 2) {
1501             throw InvalidProtocolBufferException.invalidUtf8();
1502           }
1503           DecodeUtil.handleFourBytes(
1504               byte1,
1505               /* byte2 */ UnsafeUtil.getByte(address++),
1506               /* byte3 */ UnsafeUtil.getByte(address++),
1507               /* byte4 */ UnsafeUtil.getByte(address++),
1508               resultArr,
1509               resultPos++);
1510           // 4-byte case requires two chars.
1511           resultPos++;
1512         }
1513       }
1514 
1515       return new String(resultArr, 0, resultPos);
1516     }
1517 
1518     @Override
encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1519     int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) {
1520       long outIx = offset;
1521       final long outLimit = outIx + length;
1522       final int inLimit = in.length();
1523       if (inLimit > length || out.length - length < offset) {
1524         // Not even enough room for an ASCII-encoded string.
1525         throw new ArrayIndexOutOfBoundsException(
1526             "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1527       }
1528 
1529       // Designed to take advantage of
1530       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1531       int inIx = 0;
1532       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1533         UnsafeUtil.putByte(out, outIx++, (byte) c);
1534       }
1535       if (inIx == inLimit) {
1536         // We're done, it was ASCII encoded.
1537         return (int) outIx;
1538       }
1539 
1540       for (char c; inIx < inLimit; ++inIx) {
1541         c = in.charAt(inIx);
1542         if (c < 0x80 && outIx < outLimit) {
1543           UnsafeUtil.putByte(out, outIx++, (byte) c);
1544         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1545           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1546           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1547         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1548           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1549           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1550           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1551           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1552         } else if (outIx <= outLimit - 4L) {
1553           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1554           // bytes
1555           final char low;
1556           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1557             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1558           }
1559           int codePoint = toCodePoint(c, low);
1560           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1561           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1562           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1563           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1564         } else {
1565           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1566               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1567             // We are surrogates and we're not a surrogate pair.
1568             throw new UnpairedSurrogateException(inIx, inLimit);
1569           }
1570           // Not enough space in the output buffer.
1571           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1572         }
1573       }
1574 
1575       // All bytes have been encoded.
1576       return (int) outIx;
1577     }
1578 
1579     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)1580     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1581       final long address = addressOffset(out);
1582       long outIx = address + out.position();
1583       final long outLimit = address + out.limit();
1584       final int inLimit = in.length();
1585       if (inLimit > outLimit - outIx) {
1586         // Not even enough room for an ASCII-encoded string.
1587         throw new ArrayIndexOutOfBoundsException(
1588             "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit());
1589       }
1590 
1591       // Designed to take advantage of
1592       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1593       int inIx = 0;
1594       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1595         UnsafeUtil.putByte(outIx++, (byte) c);
1596       }
1597       if (inIx == inLimit) {
1598         // We're done, it was ASCII encoded.
1599         out.position((int) (outIx - address));
1600         return;
1601       }
1602 
1603       for (char c; inIx < inLimit; ++inIx) {
1604         c = in.charAt(inIx);
1605         if (c < 0x80 && outIx < outLimit) {
1606           UnsafeUtil.putByte(outIx++, (byte) c);
1607         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1608           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1609           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1610         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1611           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1612           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1613           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1614           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1615         } else if (outIx <= outLimit - 4L) {
1616           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1617           // bytes
1618           final char low;
1619           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1620             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1621           }
1622           int codePoint = toCodePoint(c, low);
1623           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1624           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1625           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1626           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1627         } else {
1628           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1629               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1630             // We are surrogates and we're not a surrogate pair.
1631             throw new UnpairedSurrogateException(inIx, inLimit);
1632           }
1633           // Not enough space in the output buffer.
1634           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1635         }
1636       }
1637 
1638       // All bytes have been encoded.
1639       out.position((int) (outIx - address));
1640     }
1641 
1642     /**
1643      * Counts (approximately) the number of consecutive ASCII characters starting from the given
1644      * position, using the most efficient method available to the platform.
1645      *
1646      * @param bytes the array containing the character sequence
1647      * @param offset the offset position of the index (same as index + arrayBaseOffset)
1648      * @param maxChars the maximum number of characters to count
1649      * @return the number of ASCII characters found. The stopping position will be at or before the
1650      *     first non-ASCII byte.
1651      */
unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1652     private static int unsafeEstimateConsecutiveAscii(
1653         byte[] bytes, long offset, final int maxChars) {
1654       if (maxChars < UNSAFE_COUNT_ASCII_THRESHOLD) {
1655         // Don't bother with small strings.
1656         return 0;
1657       }
1658 
1659       for (int i = 0; i < maxChars; i++) {
1660         if (UnsafeUtil.getByte(bytes, offset++) < 0) {
1661           return i;
1662         }
1663       }
1664       return maxChars;
1665     }
1666 
1667     /**
1668      * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1669      * most efficient method available to the platform.
1670      */
unsafeEstimateConsecutiveAscii(long address, final int maxChars)1671     private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) {
1672       int remaining = maxChars;
1673       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1674         // Don't bother with small strings.
1675         return 0;
1676       }
1677 
1678       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1679       // We do this by ANDing the address with 7 to determine the number of bytes that need to
1680       // be read before we're 8-byte aligned.
1681       final int unaligned = 8 - ((int) address & 7);
1682       for (int j = unaligned; j > 0; j--) {
1683         if (UnsafeUtil.getByte(address++) < 0) {
1684           return unaligned - j;
1685         }
1686       }
1687 
1688       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1689       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1690       // determine if any byte in the current long is non-ASCII.
1691       remaining -= unaligned;
1692       for (;
1693           remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
1694           address += 8, remaining -= 8) {}
1695       return maxChars - remaining;
1696     }
1697 
partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1698     private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
1699       // Skip past ASCII characters as quickly as possible.
1700       final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
1701       remaining -= skipped;
1702       offset += skipped;
1703 
1704       for (; ; ) {
1705         // Optimize for interior runs of ASCII bytes.
1706         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1707         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1708         int byte1 = 0;
1709         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {}
1710         if (remaining == 0) {
1711           return COMPLETE;
1712         }
1713         remaining--;
1714 
1715         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1716         if (byte1 < (byte) 0xE0) {
1717           // Two-byte form (110xxxxx 10xxxxxx)
1718           if (remaining == 0) {
1719             // Incomplete sequence
1720             return byte1;
1721           }
1722           remaining--;
1723 
1724           // Simultaneously checks for illegal trailing-byte in
1725           // leading position and overlong 2-byte form.
1726           if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1727             return MALFORMED;
1728           }
1729         } else if (byte1 < (byte) 0xF0) {
1730           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1731           if (remaining < 2) {
1732             // Incomplete sequence
1733             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1734           }
1735           remaining -= 2;
1736 
1737           final int byte2;
1738           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1739               // overlong? 5 most significant bits must not all be zero
1740               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1741               // check for illegal surrogate codepoints
1742               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1743               // byte3 trailing-byte test
1744               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1745             return MALFORMED;
1746           }
1747         } else {
1748           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
1749           if (remaining < 3) {
1750             // Incomplete sequence
1751             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1752           }
1753           remaining -= 3;
1754 
1755           final int byte2;
1756           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1757               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1758               // if (byte1 > (byte) 0xF4 ||
1759               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1760               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1761               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1762               // byte3 trailing-byte test
1763               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF
1764               // byte4 trailing-byte test
1765               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1766             return MALFORMED;
1767           }
1768         }
1769       }
1770     }
1771 
partialIsValidUtf8(long address, int remaining)1772     private static int partialIsValidUtf8(long address, int remaining) {
1773       // Skip past ASCII characters as quickly as possible.
1774       final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1775       address += skipped;
1776       remaining -= skipped;
1777 
1778       for (; ; ) {
1779         // Optimize for interior runs of ASCII bytes.
1780         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1781         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1782         int byte1 = 0;
1783         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {}
1784         if (remaining == 0) {
1785           return COMPLETE;
1786         }
1787         remaining--;
1788 
1789         if (byte1 < (byte) 0xE0) {
1790           // Two-byte form
1791 
1792           if (remaining == 0) {
1793             // Incomplete sequence
1794             return byte1;
1795           }
1796           remaining--;
1797 
1798           // Simultaneously checks for illegal trailing-byte in
1799           // leading position and overlong 2-byte form.
1800           if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1801             return MALFORMED;
1802           }
1803         } else if (byte1 < (byte) 0xF0) {
1804           // Three-byte form
1805 
1806           if (remaining < 2) {
1807             // Incomplete sequence
1808             return unsafeIncompleteStateFor(address, byte1, remaining);
1809           }
1810           remaining -= 2;
1811 
1812           final byte byte2 = UnsafeUtil.getByte(address++);
1813           if (byte2 > (byte) 0xBF
1814               // overlong? 5 most significant bits must not all be zero
1815               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1816               // check for illegal surrogate codepoints
1817               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1818               // byte3 trailing-byte test
1819               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1820             return MALFORMED;
1821           }
1822         } else {
1823           // Four-byte form
1824 
1825           if (remaining < 3) {
1826             // Incomplete sequence
1827             return unsafeIncompleteStateFor(address, byte1, remaining);
1828           }
1829           remaining -= 3;
1830 
1831           final byte byte2 = UnsafeUtil.getByte(address++);
1832           if (byte2 > (byte) 0xBF
1833               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1834               // if (byte1 > (byte) 0xF4 ||
1835               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1836               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1837               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1838               // byte3 trailing-byte test
1839               || UnsafeUtil.getByte(address++) > (byte) 0xBF
1840               // byte4 trailing-byte test
1841               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1842             return MALFORMED;
1843           }
1844         }
1845       }
1846     }
1847 
unsafeIncompleteStateFor( byte[] bytes, int byte1, long offset, int remaining)1848     private static int unsafeIncompleteStateFor(
1849         byte[] bytes, int byte1, long offset, int remaining) {
1850       switch (remaining) {
1851         case 0:
1852           return incompleteStateFor(byte1);
1853         case 1:
1854           return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
1855         case 2:
1856           return incompleteStateFor(
1857               byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1));
1858         default:
1859           throw new AssertionError();
1860       }
1861     }
1862 
unsafeIncompleteStateFor(long address, final int byte1, int remaining)1863     private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
1864       switch (remaining) {
1865         case 0:
1866           return incompleteStateFor(byte1);
1867         case 1:
1868           return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
1869         case 2:
1870           return incompleteStateFor(
1871               byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1));
1872         default:
1873           throw new AssertionError();
1874       }
1875     }
1876   }
1877 
1878   /**
1879    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
1880    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
1881    * checks and codepoint conversion happen in this class.
1882    */
1883   private static class DecodeUtil {
1884 
1885     /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */
isOneByte(byte b)1886     private static boolean isOneByte(byte b) {
1887       return b >= 0;
1888     }
1889 
1890     /** Returns whether this is a two-byte codepoint with the form '10XXXXXX'. */
isTwoBytes(byte b)1891     private static boolean isTwoBytes(byte b) {
1892       return b < (byte) 0xE0;
1893     }
1894 
1895     /** Returns whether this is a three-byte codepoint with the form '110XXXXX'. */
isThreeBytes(byte b)1896     private static boolean isThreeBytes(byte b) {
1897       return b < (byte) 0xF0;
1898     }
1899 
handleOneByte(byte byte1, char[] resultArr, int resultPos)1900     private static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
1901       resultArr[resultPos] = (char) byte1;
1902     }
1903 
handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)1904     private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)
1905         throws InvalidProtocolBufferException {
1906       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
1907       // overlong 2-byte, '11000001'.
1908       if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) {
1909         throw InvalidProtocolBufferException.invalidUtf8();
1910       }
1911       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
1912     }
1913 
handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)1914     private static void handleThreeBytes(
1915         byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
1916         throws InvalidProtocolBufferException {
1917       if (isNotTrailingByte(byte2)
1918           // overlong? 5 most significant bits must not all be zero
1919           || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1920           // check for illegal surrogate codepoints
1921           || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1922           || isNotTrailingByte(byte3)) {
1923         throw InvalidProtocolBufferException.invalidUtf8();
1924       }
1925       resultArr[resultPos] =
1926           (char)
1927               (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
1928     }
1929 
handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)1930     private static void handleFourBytes(
1931         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
1932         throws InvalidProtocolBufferException {
1933       if (isNotTrailingByte(byte2)
1934           // Check that 1 <= plane <= 16.  Tricky optimized form of:
1935           //   valid 4-byte leading byte?
1936           // if (byte1 > (byte) 0xF4 ||
1937           //   overlong? 4 most significant bits must not all be zero
1938           //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1939           //   codepoint larger than the highest code point (U+10FFFF)?
1940           //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1941           || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1942           || isNotTrailingByte(byte3)
1943           || isNotTrailingByte(byte4)) {
1944         throw InvalidProtocolBufferException.invalidUtf8();
1945       }
1946       int codepoint =
1947           ((byte1 & 0x07) << 18)
1948               | (trailingByteValue(byte2) << 12)
1949               | (trailingByteValue(byte3) << 6)
1950               | trailingByteValue(byte4);
1951       resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
1952       resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
1953     }
1954 
1955     /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */
isNotTrailingByte(byte b)1956     private static boolean isNotTrailingByte(byte b) {
1957       return b > (byte) 0xBF;
1958     }
1959 
1960     /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */
trailingByteValue(byte b)1961     private static int trailingByteValue(byte b) {
1962       return b & 0x3F;
1963     }
1964 
highSurrogate(int codePoint)1965     private static char highSurrogate(int codePoint) {
1966       return (char)
1967           ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
1968     }
1969 
lowSurrogate(int codePoint)1970     private static char lowSurrogate(int codePoint) {
1971       return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
1972     }
1973   }
1974 
Utf8()1975   private Utf8() {}
1976 }
1977