• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 package com.google.protobuf;
9 
10 import static com.google.protobuf.UnsafeUtil.addressOffset;
11 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
12 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
13 import static java.lang.Character.MAX_SURROGATE;
14 import static java.lang.Character.MIN_HIGH_SURROGATE;
15 import static java.lang.Character.MIN_LOW_SURROGATE;
16 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
17 import static java.lang.Character.MIN_SURROGATE;
18 import static java.lang.Character.isSurrogatePair;
19 import static java.lang.Character.toCodePoint;
20 
21 import java.nio.ByteBuffer;
22 import java.util.Arrays;
23 
24 /**
25  * A set of low-level, high-performance static utility methods related to the UTF-8 character
26  * encoding. This class has no dependencies outside of the core JDK libraries.
27  *
28  * <p>There are several variants of UTF-8. The one implemented by this class is the restricted
29  * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte
30  * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8
31  * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but
32  * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences.
33  *
34  * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip
35  * converted to Strings and back to bytes using the UTF-8 charset, without loss:
36  *
37  * <pre>{@code
38  * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
39  * }</pre>
40  *
41  * <p>See the Unicode Standard,</br> Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> Table 3-7.
42  * <em>Well Formed UTF-8 Byte Sequences</em>.
43  *
44  * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8
45  * byte sequence can be stored in multiple segments. Methods typically return {@link #MALFORMED} if
46  * the partial byte sequence is definitely not well-formed; {@link #COMPLETE} if it is well-formed
47  * in the absence of additional input; or, if the byte sequence apparently terminated in the middle
48  * of a character, an opaque integer "state" value containing enough information to decode the
49  * character when passed to a subsequent invocation of a partial decoding method.
50  *
51  * @author martinrb@google.com (Martin Buchholz)
52  */
53 // TODO: Copy changes in this class back to Guava
54 final class Utf8 {
55 
56   /**
57    * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
58    * depending on what is available on the platform. The processor is the platform-optimized
59    * delegate for which all methods are delegated directly to.
60    */
61   private static final Processor processor =
62       (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice())
63           ? new UnsafeProcessor()
64           : new SafeProcessor();
65 
66   /**
67    * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
68    * characters (i.e. any byte >= 0x80).
69    */
70   private static final long ASCII_MASK_LONG = 0x8080808080808080L;
71 
72   /**
73    * Maximum number of bytes per Java UTF-16 char in UTF-8.
74    *
75    * @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
76    */
77   static final int MAX_BYTES_PER_CHAR = 3;
78 
79   /**
80    * State value indicating that the byte sequence is well-formed and complete (no further bytes are
81    * needed to complete a character).
82    */
83   static final int COMPLETE = 0;
84 
85   /** State value indicating that the byte sequence is definitely not well-formed. */
86   static final int MALFORMED = -1;
87 
88   /**
89    * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
90    * above which to employ an optimized algorithm for counting ASCII characters. The reason for this
91    * threshold is that for small strings, the optimization may not be beneficial or may even
92    * negatively impact performance since it requires additional logic to avoid unaligned reads (when
93    * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is
94    * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which
95    * provides a performance improvement that entirely subsumes the cost of the additional logic.
96    */
97   private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
98 
99   // Other state values include the partial bytes of the incomplete
100   // character to be decoded in the simplest way: we pack the bytes
101   // into the state int in little-endian order.  For example:
102   //
103   // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16);
104   //
105   // Such a state is unpacked thus (note the ~ operation for byte2 to
106   // undo byte1's sign-extension bits):
107   //
108   // int byte1 = (byte) state;
109   // int byte2 = (byte) ~(state >> 8);
110   // int byte3 = (byte) (state >> 16);
111   //
112   // We cannot store a zero byte in the state because it would be
113   // indistinguishable from the absence of a byte.  But we don't need
114   // to, because partial bytes must always be negative.  When building
115   // a state, we ensure that byte1 is negative and subsequent bytes
116   // are valid trailing bytes.
117 
118   /**
119    * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence.
120    *
121    * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0,
122    * bytes.length)}.
123    */
isValidUtf8(byte[] bytes)124   static boolean isValidUtf8(byte[] bytes) {
125     return processor.isValidUtf8(bytes, 0, bytes.length);
126   }
127 
128   /**
129    * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
130    * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
131    * exclusive.
132    *
133    * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
134    * == Utf8.COMPLETE}.
135    */
isValidUtf8(byte[] bytes, int index, int limit)136   static boolean isValidUtf8(byte[] bytes, int index, int limit) {
137     return processor.isValidUtf8(bytes, index, limit);
138   }
139 
140   /**
141    * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
142    * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
143    * {@code limit}, exclusive.
144    *
145    * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
146    *     value returned from a call to a partial decoding method for the previous bytes
147    * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
148    *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
149    *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
150    *     "state" value containing enough information to decode the character when passed to a
151    *     subsequent invocation of a partial decoding method.
152    */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)153   static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
154     return processor.partialIsValidUtf8(state, bytes, index, limit);
155   }
156 
incompleteStateFor(int byte1)157   private static int incompleteStateFor(int byte1) {
158     return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
159   }
160 
incompleteStateFor(int byte1, int byte2)161   private static int incompleteStateFor(int byte1, int byte2) {
162     return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
163   }
164 
incompleteStateFor(int byte1, int byte2, int byte3)165   private static int incompleteStateFor(int byte1, int byte2, int byte3) {
166     return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF)
167         ? MALFORMED
168         : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
169   }
170 
incompleteStateFor(byte[] bytes, int index, int limit)171   private static int incompleteStateFor(byte[] bytes, int index, int limit) {
172     int byte1 = bytes[index - 1];
173     switch (limit - index) {
174       case 0:
175         return incompleteStateFor(byte1);
176       case 1:
177         return incompleteStateFor(byte1, bytes[index]);
178       case 2:
179         return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
180       default:
181         throw new AssertionError();
182     }
183   }
184 
incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)185   private static int incompleteStateFor(
186       final ByteBuffer buffer, final int byte1, final int index, final int remaining) {
187     switch (remaining) {
188       case 0:
189         return incompleteStateFor(byte1);
190       case 1:
191         return incompleteStateFor(byte1, buffer.get(index));
192       case 2:
193         return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1));
194       default:
195         throw new AssertionError();
196     }
197   }
198 
199   // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
200   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
201   // fallback to more lenient behavior.
202 
203   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)204     UnpairedSurrogateException(int index, int length) {
205       super("Unpaired surrogate at index " + index + " of " + length);
206     }
207   }
208 
209   /**
210    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
211    * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
212    * time and space.
213    *
214    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
215    *     surrogates)
216    */
encodedLength(String string)217   static int encodedLength(String string) {
218     // Warning to maintainers: this implementation is highly optimized.
219     int utf16Length = string.length();
220     int utf8Length = utf16Length;
221     int i = 0;
222 
223     // This loop optimizes for pure ASCII.
224     while (i < utf16Length && string.charAt(i) < 0x80) {
225       i++;
226     }
227 
228     // This loop optimizes for chars less than 0x800.
229     for (; i < utf16Length; i++) {
230       char c = string.charAt(i);
231       if (c < 0x800) {
232         utf8Length += ((0x7f - c) >>> 31); // branch free!
233       } else {
234         utf8Length += encodedLengthGeneral(string, i);
235         break;
236       }
237     }
238 
239     if (utf8Length < utf16Length) {
240       // Necessary and sufficient condition for overflow because of maximum 3x expansion
241       throw new IllegalArgumentException(
242           "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
243     }
244     return utf8Length;
245   }
246 
encodedLengthGeneral(String string, int start)247   private static int encodedLengthGeneral(String string, int start) {
248     int utf16Length = string.length();
249     int utf8Length = 0;
250     for (int i = start; i < utf16Length; i++) {
251       char c = string.charAt(i);
252       if (c < 0x800) {
253         utf8Length += (0x7f - c) >>> 31; // branch free!
254       } else {
255         utf8Length += 2;
256         // jdk7+: if (Character.isSurrogate(c)) {
257         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
258           // Check that we have a well-formed surrogate pair.
259           int cp = Character.codePointAt(string, i);
260           if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
261             throw new UnpairedSurrogateException(i, utf16Length);
262           }
263           i++;
264         }
265       }
266     }
267     return utf8Length;
268   }
269 
encode(String in, byte[] out, int offset, int length)270   static int encode(String in, byte[] out, int offset, int length) {
271     return processor.encodeUtf8(in, out, offset, length);
272   }
273   // End Guava UTF-8 methods.
274 
275   /**
276    * Determines if the given {@link ByteBuffer} is a valid UTF-8 string.
277    *
278    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
279    * and the capabilities of the platform.
280    *
281    * @param buffer the buffer to check.
282    * @see Utf8#isValidUtf8(byte[], int, int)
283    */
isValidUtf8(ByteBuffer buffer)284   static boolean isValidUtf8(ByteBuffer buffer) {
285     return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
286   }
287 
288   /**
289    * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
290    *
291    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
292    * and the capabilities of the platform.
293    *
294    * @param buffer the buffer to check.
295    * @see Utf8#partialIsValidUtf8(int, byte[], int, int)
296    */
partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)297   static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
298     return processor.partialIsValidUtf8(state, buffer, index, limit);
299   }
300 
301   /**
302    * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
303    *
304    * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
305    */
decodeUtf8(ByteBuffer buffer, int index, int size)306   static String decodeUtf8(ByteBuffer buffer, int index, int size)
307       throws InvalidProtocolBufferException {
308     return processor.decodeUtf8(buffer, index, size);
309   }
310 
311   /**
312    * Decodes the given UTF-8 encoded byte array slice into a {@link String}.
313    *
314    * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
315    */
decodeUtf8(byte[] bytes, int index, int size)316   static String decodeUtf8(byte[] bytes, int index, int size)
317       throws InvalidProtocolBufferException {
318     return processor.decodeUtf8(bytes, index, size);
319   }
320 
321   /**
322    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
323    *
324    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
325    * and the capabilities of the platform.
326    *
327    * @param in the source string to be encoded
328    * @param out the target buffer to receive the encoded string.
329    * @see Utf8#encode(String, byte[], int, int)
330    */
encodeUtf8(String in, ByteBuffer out)331   static void encodeUtf8(String in, ByteBuffer out) {
332     processor.encodeUtf8(in, out);
333   }
334 
335   /**
336    * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte
337    * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte
338    * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
339    *
340    * @param buffer the buffer to be scanned for ASCII chars
341    * @param index the starting index of the scan
342    * @param limit the limit within buffer for the scan
343    * @return the number of ASCII characters found. The stopping position will be at or before the
344    *     first non-ASCII byte.
345    */
estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)346   private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
347     int i = index;
348     final int lim = limit - 7;
349     // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
350     // To speed things up further, we're reading longs instead of bytes so we use a mask to
351     // determine if any byte in the current long is non-ASCII.
352     for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {}
353     return i - index;
354   }
355 
356   /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */
357   // TODO: Add support for Memory/MemoryBlock on Android.
358   abstract static class Processor {
359     /**
360      * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
361      * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
362      * exclusive.
363      *
364      * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
365      * == Utf8.COMPLETE}.
366      */
isValidUtf8(byte[] bytes, int index, int limit)367     final boolean isValidUtf8(byte[] bytes, int index, int limit) {
368       return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
369     }
370 
371     /**
372      * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8
373      * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
374      * to {@code limit}, exclusive.
375      *
376      * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
377      *     value returned from a call to a partial decoding method for the previous bytes
378      * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
379      *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
380      *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
381      *     "state" value containing enough information to decode the character when passed to a
382      *     subsequent invocation of a partial decoding method.
383      */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)384     abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
385 
386     /**
387      * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
388      * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
389      * to {@code limit}, exclusive.
390      *
391      * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit)
392      * == Utf8.COMPLETE}.
393      */
isValidUtf8(ByteBuffer buffer, int index, int limit)394     final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
395       return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
396     }
397 
398     /**
399      * Indicates whether or not the given buffer contains a valid UTF-8 string.
400      *
401      * @param buffer the buffer to check.
402      * @return {@code true} if the given buffer contains a valid UTF-8 string.
403      */
partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)404     final int partialIsValidUtf8(
405         final int state, final ByteBuffer buffer, int index, final int limit) {
406       if (buffer.hasArray()) {
407         final int offset = buffer.arrayOffset();
408         return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
409       } else if (buffer.isDirect()) {
410         return partialIsValidUtf8Direct(state, buffer, index, limit);
411       }
412       return partialIsValidUtf8Default(state, buffer, index, limit);
413     }
414 
415     /** Performs validation for direct {@link ByteBuffer} instances. */
partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)416     abstract int partialIsValidUtf8Direct(
417         final int state, final ByteBuffer buffer, int index, final int limit);
418 
419     /**
420      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
421      * than potentially faster approaches. This first completes validation for the current character
422      * (provided by {@code state}) and then finishes validation for the sequence.
423      */
partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)424     final int partialIsValidUtf8Default(
425         final int state, final ByteBuffer buffer, int index, final int limit) {
426       if (state != COMPLETE) {
427         // The previous decoding operation was incomplete (or malformed).
428         // We look for a well-formed sequence consisting of bytes from
429         // the previous decoding operation (stored in state) together
430         // with bytes from the array slice.
431         //
432         // We expect such "straddler characters" to be rare.
433 
434         if (index >= limit) { // No bytes? No progress.
435           return state;
436         }
437 
438         byte byte1 = (byte) state;
439         // byte1 is never ASCII.
440         if (byte1 < (byte) 0xE0) {
441           // two-byte form
442 
443           // Simultaneously checks for illegal trailing-byte in
444           // leading position and overlong 2-byte form.
445           if (byte1 < (byte) 0xC2
446               // byte2 trailing-byte test
447               || buffer.get(index++) > (byte) 0xBF) {
448             return MALFORMED;
449           }
450         } else if (byte1 < (byte) 0xF0) {
451           // three-byte form
452 
453           // Get byte2 from saved state or array
454           byte byte2 = (byte) ~(state >> 8);
455           if (byte2 == 0) {
456             byte2 = buffer.get(index++);
457             if (index >= limit) {
458               return incompleteStateFor(byte1, byte2);
459             }
460           }
461           if (byte2 > (byte) 0xBF
462               // overlong? 5 most significant bits must not all be zero
463               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
464               // illegal surrogate codepoint?
465               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
466               // byte3 trailing-byte test
467               || buffer.get(index++) > (byte) 0xBF) {
468             return MALFORMED;
469           }
470         } else {
471           // four-byte form
472 
473           // Get byte2 and byte3 from saved state or array
474           byte byte2 = (byte) ~(state >> 8);
475           byte byte3 = 0;
476           if (byte2 == 0) {
477             byte2 = buffer.get(index++);
478             if (index >= limit) {
479               return incompleteStateFor(byte1, byte2);
480             }
481           } else {
482             byte3 = (byte) (state >> 16);
483           }
484           if (byte3 == 0) {
485             byte3 = buffer.get(index++);
486             if (index >= limit) {
487               return incompleteStateFor(byte1, byte2, byte3);
488             }
489           }
490 
491           // If we were called with state == MALFORMED, then byte1 is 0xFF,
492           // which never occurs in well-formed UTF-8, and so we will return
493           // MALFORMED again below.
494 
495           if (byte2 > (byte) 0xBF
496               // Check that 1 <= plane <= 16.  Tricky optimized form of:
497               // if (byte1 > (byte) 0xF4 ||
498               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
499               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
500               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
501               // byte3 trailing-byte test
502               || byte3 > (byte) 0xBF
503               // byte4 trailing-byte test
504               || buffer.get(index++) > (byte) 0xBF) {
505             return MALFORMED;
506           }
507         }
508       }
509 
510       // Finish validation for the sequence.
511       return partialIsValidUtf8(buffer, index, limit);
512     }
513 
514     /**
515      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
516      * than potentially faster approaches.
517      */
partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)518     private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
519       index += estimateConsecutiveAscii(buffer, index, limit);
520 
521       for (; ; ) {
522         // Optimize for interior runs of ASCII bytes.
523         // TODO: Consider checking 8 bytes at a time after some threshold?
524         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
525         int byte1;
526         do {
527           if (index >= limit) {
528             return COMPLETE;
529           }
530         } while ((byte1 = buffer.get(index++)) >= 0);
531 
532         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
533         if (byte1 < (byte) 0xE0) {
534           // Two-byte form (110xxxxx 10xxxxxx)
535           if (index >= limit) {
536             // Incomplete sequence
537             return byte1;
538           }
539 
540           // Simultaneously checks for illegal trailing-byte in
541           // leading position and overlong 2-byte form.
542           if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) {
543             return MALFORMED;
544           }
545           index++;
546         } else if (byte1 < (byte) 0xF0) {
547           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
548           if (index >= limit - 1) {
549             // Incomplete sequence
550             return incompleteStateFor(buffer, byte1, index, limit - index);
551           }
552 
553           byte byte2 = buffer.get(index++);
554           if (byte2 > (byte) 0xBF
555               // overlong? 5 most significant bits must not all be zero
556               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
557               // check for illegal surrogate codepoints
558               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
559               // byte3 trailing-byte test
560               || buffer.get(index) > (byte) 0xBF) {
561             return MALFORMED;
562           }
563           index++;
564         } else {
565           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
566           if (index >= limit - 2) {
567             // Incomplete sequence
568             return incompleteStateFor(buffer, byte1, index, limit - index);
569           }
570 
571           // TODO: Consider using getInt() to improve performance.
572           int byte2 = buffer.get(index++);
573           if (byte2 > (byte) 0xBF
574               // Check that 1 <= plane <= 16.  Tricky optimized form of:
575               // if (byte1 > (byte) 0xF4 ||
576               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
577               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
578               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
579               // byte3 trailing-byte test
580               || buffer.get(index++) > (byte) 0xBF
581               // byte4 trailing-byte test
582               || buffer.get(index++) > (byte) 0xBF) {
583             return MALFORMED;
584           }
585         }
586       }
587     }
588 
589     /**
590      * Decodes the given byte array slice into a {@link String}.
591      *
592      * @throws InvalidProtocolBufferException if the byte array slice is not valid UTF-8
593      */
decodeUtf8(byte[] bytes, int index, int size)594     abstract String decodeUtf8(byte[] bytes, int index, int size)
595         throws InvalidProtocolBufferException;
596 
597     /**
598      * Decodes the given portion of the {@link ByteBuffer} into a {@link String}.
599      *
600      * @throws InvalidProtocolBufferException if the portion of the buffer is not valid UTF-8
601      */
decodeUtf8(ByteBuffer buffer, int index, int size)602     final String decodeUtf8(ByteBuffer buffer, int index, int size)
603         throws InvalidProtocolBufferException {
604       if (buffer.hasArray()) {
605         final int offset = buffer.arrayOffset();
606         return decodeUtf8(buffer.array(), offset + index, size);
607       } else if (buffer.isDirect()) {
608         return decodeUtf8Direct(buffer, index, size);
609       }
610       return decodeUtf8Default(buffer, index, size);
611     }
612 
613     /** Decodes direct {@link ByteBuffer} instances into {@link String}. */
decodeUtf8Direct(ByteBuffer buffer, int index, int size)614     abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
615         throws InvalidProtocolBufferException;
616 
617     /**
618      * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially
619      * faster approaches.
620      */
decodeUtf8Default(ByteBuffer buffer, int index, int size)621     final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
622         throws InvalidProtocolBufferException {
623       // Bitwise OR combines the sign bits so any negative value fails the check.
624       if ((index | size | buffer.limit() - index - size) < 0) {
625         throw new ArrayIndexOutOfBoundsException(
626             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
627       }
628 
629       int offset = index;
630       int limit = offset + size;
631 
632       // The longest possible resulting String is the same as the number of input bytes, when it is
633       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
634       char[] resultArr = new char[size];
635       int resultPos = 0;
636 
637       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
638       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
639       while (offset < limit) {
640         byte b = buffer.get(offset);
641         if (!DecodeUtil.isOneByte(b)) {
642           break;
643         }
644         offset++;
645         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
646       }
647 
648       while (offset < limit) {
649         byte byte1 = buffer.get(offset++);
650         if (DecodeUtil.isOneByte(byte1)) {
651           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
652           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
653           // extra optimized loop to take care of these runs.
654           while (offset < limit) {
655             byte b = buffer.get(offset);
656             if (!DecodeUtil.isOneByte(b)) {
657               break;
658             }
659             offset++;
660             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
661           }
662         } else if (DecodeUtil.isTwoBytes(byte1)) {
663           if (offset >= limit) {
664             throw InvalidProtocolBufferException.invalidUtf8();
665           }
666           DecodeUtil.handleTwoBytes(
667               byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++);
668         } else if (DecodeUtil.isThreeBytes(byte1)) {
669           if (offset >= limit - 1) {
670             throw InvalidProtocolBufferException.invalidUtf8();
671           }
672           DecodeUtil.handleThreeBytes(
673               byte1,
674               /* byte2 */ buffer.get(offset++),
675               /* byte3 */ buffer.get(offset++),
676               resultArr,
677               resultPos++);
678         } else {
679           if (offset >= limit - 2) {
680             throw InvalidProtocolBufferException.invalidUtf8();
681           }
682           DecodeUtil.handleFourBytes(
683               byte1,
684               /* byte2 */ buffer.get(offset++),
685               /* byte3 */ buffer.get(offset++),
686               /* byte4 */ buffer.get(offset++),
687               resultArr,
688               resultPos++);
689           // 4-byte case requires two chars.
690           resultPos++;
691         }
692       }
693 
694       return new String(resultArr, 0, resultPos);
695     }
696 
697     /**
698      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
699      * For a string, this method is similar to
700      *
701      * <pre>{@code
702      * byte[] a = string.getBytes(UTF_8);
703      * System.arraycopy(a, 0, bytes, offset, a.length);
704      * return offset + a.length;
705      * }</pre>
706      *
707      * but is more efficient in both time and space. One key difference is that this method requires
708      * paired surrogates, and therefore does not support chunking. While {@code
709      * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character,
710      * this method throws {@link UnpairedSurrogateException}.
711      *
712      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
713      * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
714      * sequence.length()}, which is the largest possible number of bytes that any input can be
715      * encoded to.
716      *
717      * @param in the input character sequence to be encoded
718      * @param out the target array
719      * @param offset the starting offset in {@code bytes} to start writing at
720      * @param length the length of the {@code bytes}, starting from {@code offset}
721      * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired
722      *     surrogates)
723      * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than
724      *     {@code bytes.length - offset}
725      * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)}
726      */
encodeUtf8(String in, byte[] out, int offset, int length)727     abstract int encodeUtf8(String in, byte[] out, int offset, int length);
728 
729     /**
730      * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
731      * Upon returning from this method, the {@code out} position will point to the position after
732      * the last encoded byte. This method requires paired surrogates, and therefore does not support
733      * chunking.
734      *
735      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
736      * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
737      * in.length()}, which is the largest possible number of bytes that any input can be encoded to.
738      *
739      * @param in the source character sequence to be encoded
740      * @param out the target buffer
741      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
742      *     surrogates)
743      * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code
744      *     out.remaining()}
745      */
encodeUtf8(String in, ByteBuffer out)746     final void encodeUtf8(String in, ByteBuffer out) {
747       if (out.hasArray()) {
748         final int offset = out.arrayOffset();
749         int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
750         Java8Compatibility.position(out, endIndex - offset);
751       } else if (out.isDirect()) {
752         encodeUtf8Direct(in, out);
753       } else {
754         encodeUtf8Default(in, out);
755       }
756     }
757 
758     /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */
encodeUtf8Direct(String in, ByteBuffer out)759     abstract void encodeUtf8Direct(String in, ByteBuffer out);
760 
761     /**
762      * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link
763      * ByteBuffer} API, rather than potentially faster approaches.
764      */
encodeUtf8Default(String in, ByteBuffer out)765     final void encodeUtf8Default(String in, ByteBuffer out) {
766       final int inLength = in.length();
767       int outIx = out.position();
768       int inIx = 0;
769 
770       // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
771       // access. Assume the buffer is big enough and let it handle the out of bounds exception
772       // if it occurs.
773       try {
774         // Designed to take advantage of
775         // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
776         for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
777           out.put(outIx + inIx, (byte) c);
778         }
779         if (inIx == inLength) {
780           // Successfully encoded the entire string.
781           Java8Compatibility.position(out, outIx + inIx);
782           return;
783         }
784 
785         outIx += inIx;
786         for (char c; inIx < inLength; ++inIx, ++outIx) {
787           c = in.charAt(inIx);
788           if (c < 0x80) {
789             // One byte (0xxx xxxx)
790             out.put(outIx, (byte) c);
791           } else if (c < 0x800) {
792             // Two bytes (110x xxxx 10xx xxxx)
793 
794             // Benchmarks show put performs better than putShort here (for HotSpot).
795             out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
796             out.put(outIx, (byte) (0x80 | (0x3F & c)));
797           } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
798             // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
799             // Maximum single-char code point is 0xFFFF, 16 bits.
800 
801             // Benchmarks show put performs better than putShort here (for HotSpot).
802             out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
803             out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
804             out.put(outIx, (byte) (0x80 | (0x3F & c)));
805           } else {
806             // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
807 
808             // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
809             // bytes
810             final char low;
811             if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
812               throw new UnpairedSurrogateException(inIx, inLength);
813             }
814             // TODO: Consider using putInt() to improve performance.
815             int codePoint = toCodePoint(c, low);
816             out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
817             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
818             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
819             out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
820           }
821         }
822 
823         // Successfully encoded the entire string.
824         Java8Compatibility.position(out, outIx);
825       } catch (IndexOutOfBoundsException e) {
826         // TODO: Consider making the API throw IndexOutOfBoundsException instead.
827 
828         // If we failed in the outer ASCII loop, outIx will not have been updated. In this case,
829         // use inIx to determine the bad write index.
830         int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
831         throw new ArrayIndexOutOfBoundsException(
832             "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex);
833       }
834     }
835   }
836 
837   /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
838   static final class SafeProcessor extends Processor {
839     @Override
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)840     int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
841       if (state != COMPLETE) {
842         // The previous decoding operation was incomplete (or malformed).
843         // We look for a well-formed sequence consisting of bytes from
844         // the previous decoding operation (stored in state) together
845         // with bytes from the array slice.
846         //
847         // We expect such "straddler characters" to be rare.
848 
849         if (index >= limit) { // No bytes? No progress.
850           return state;
851         }
852         int byte1 = (byte) state;
853         // byte1 is never ASCII.
854         if (byte1 < (byte) 0xE0) {
855           // two-byte form
856 
857           // Simultaneously checks for illegal trailing-byte in
858           // leading position and overlong 2-byte form.
859           if (byte1 < (byte) 0xC2
860               // byte2 trailing-byte test
861               || bytes[index++] > (byte) 0xBF) {
862             return MALFORMED;
863           }
864         } else if (byte1 < (byte) 0xF0) {
865           // three-byte form
866 
867           // Get byte2 from saved state or array
868           int byte2 = (byte) ~(state >> 8);
869           if (byte2 == 0) {
870             byte2 = bytes[index++];
871             if (index >= limit) {
872               return incompleteStateFor(byte1, byte2);
873             }
874           }
875           if (byte2 > (byte) 0xBF
876               // overlong? 5 most significant bits must not all be zero
877               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
878               // illegal surrogate codepoint?
879               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
880               // byte3 trailing-byte test
881               || bytes[index++] > (byte) 0xBF) {
882             return MALFORMED;
883           }
884         } else {
885           // four-byte form
886 
887           // Get byte2 and byte3 from saved state or array
888           int byte2 = (byte) ~(state >> 8);
889           int byte3 = 0;
890           if (byte2 == 0) {
891             byte2 = bytes[index++];
892             if (index >= limit) {
893               return incompleteStateFor(byte1, byte2);
894             }
895           } else {
896             byte3 = (byte) (state >> 16);
897           }
898           if (byte3 == 0) {
899             byte3 = bytes[index++];
900             if (index >= limit) {
901               return incompleteStateFor(byte1, byte2, byte3);
902             }
903           }
904 
905           // If we were called with state == MALFORMED, then byte1 is 0xFF,
906           // which never occurs in well-formed UTF-8, and so we will return
907           // MALFORMED again below.
908 
909           if (byte2 > (byte) 0xBF
910               // Check that 1 <= plane <= 16.  Tricky optimized form of:
911               // if (byte1 > (byte) 0xF4 ||
912               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
913               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
914               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
915               // byte3 trailing-byte test
916               || byte3 > (byte) 0xBF
917               // byte4 trailing-byte test
918               || bytes[index++] > (byte) 0xBF) {
919             return MALFORMED;
920           }
921         }
922       }
923 
924       return partialIsValidUtf8(bytes, index, limit);
925     }
926 
927     @Override
partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)928     int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
929       // For safe processing, we have to use the ByteBuffer API.
930       return partialIsValidUtf8Default(state, buffer, index, limit);
931     }
932 
933     @Override
decodeUtf8(byte[] bytes, int index, int size)934     String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
935       // Bitwise OR combines the sign bits so any negative value fails the check.
936       if ((index | size | bytes.length - index - size) < 0) {
937         throw new ArrayIndexOutOfBoundsException(
938             String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
939       }
940 
941       int offset = index;
942       final int limit = offset + size;
943 
944       // The longest possible resulting String is the same as the number of input bytes, when it is
945       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
946       char[] resultArr = new char[size];
947       int resultPos = 0;
948 
949       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
950       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
951       while (offset < limit) {
952         byte b = bytes[offset];
953         if (!DecodeUtil.isOneByte(b)) {
954           break;
955         }
956         offset++;
957         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
958       }
959 
960       while (offset < limit) {
961         byte byte1 = bytes[offset++];
962         if (DecodeUtil.isOneByte(byte1)) {
963           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
964           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
965           // extra optimized loop to take care of these runs.
966           while (offset < limit) {
967             byte b = bytes[offset];
968             if (!DecodeUtil.isOneByte(b)) {
969               break;
970             }
971             offset++;
972             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
973           }
974         } else if (DecodeUtil.isTwoBytes(byte1)) {
975           if (offset >= limit) {
976             throw InvalidProtocolBufferException.invalidUtf8();
977           }
978           DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++);
979         } else if (DecodeUtil.isThreeBytes(byte1)) {
980           if (offset >= limit - 1) {
981             throw InvalidProtocolBufferException.invalidUtf8();
982           }
983           DecodeUtil.handleThreeBytes(
984               byte1,
985               /* byte2 */ bytes[offset++],
986               /* byte3 */ bytes[offset++],
987               resultArr,
988               resultPos++);
989         } else {
990           if (offset >= limit - 2) {
991             throw InvalidProtocolBufferException.invalidUtf8();
992           }
993           DecodeUtil.handleFourBytes(
994               byte1,
995               /* byte2 */ bytes[offset++],
996               /* byte3 */ bytes[offset++],
997               /* byte4 */ bytes[offset++],
998               resultArr,
999               resultPos++);
1000           // 4-byte case requires two chars.
1001           resultPos++;
1002         }
1003       }
1004 
1005       return new String(resultArr, 0, resultPos);
1006     }
1007 
1008     @Override
decodeUtf8Direct(ByteBuffer buffer, int index, int size)1009     String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
1010         throws InvalidProtocolBufferException {
1011       // For safe processing, we have to use the ByteBufferAPI.
1012       return decodeUtf8Default(buffer, index, size);
1013     }
1014 
1015     @Override
encodeUtf8(String in, byte[] out, int offset, int length)1016     int encodeUtf8(String in, byte[] out, int offset, int length) {
1017       int utf16Length = in.length();
1018       int j = offset;
1019       int i = 0;
1020       int limit = offset + length;
1021       // Designed to take advantage of
1022       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1023       for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) {
1024         out[j + i] = (byte) c;
1025       }
1026       if (i == utf16Length) {
1027         return j + utf16Length;
1028       }
1029       j += i;
1030       for (char c; i < utf16Length; i++) {
1031         c = in.charAt(i);
1032         if (c < 0x80 && j < limit) {
1033           out[j++] = (byte) c;
1034         } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
1035           out[j++] = (byte) ((0xF << 6) | (c >>> 6));
1036           out[j++] = (byte) (0x80 | (0x3F & c));
1037         } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) {
1038           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1039           out[j++] = (byte) ((0xF << 5) | (c >>> 12));
1040           out[j++] = (byte) (0x80 | (0x3F & (c >>> 6)));
1041           out[j++] = (byte) (0x80 | (0x3F & c));
1042         } else if (j <= limit - 4) {
1043           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
1044           // four UTF-8 bytes
1045           final char low;
1046           if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
1047             throw new UnpairedSurrogateException((i - 1), utf16Length);
1048           }
1049           int codePoint = Character.toCodePoint(c, low);
1050           out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18));
1051           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
1052           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
1053           out[j++] = (byte) (0x80 | (0x3F & codePoint));
1054         } else {
1055           // If we are surrogates and we're not a surrogate pair, always throw an
1056           // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
1057           if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
1058               && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
1059             throw new UnpairedSurrogateException(i, utf16Length);
1060           }
1061           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
1062         }
1063       }
1064       return j;
1065     }
1066 
1067     @Override
encodeUtf8Direct(String in, ByteBuffer out)1068     void encodeUtf8Direct(String in, ByteBuffer out) {
1069       // For safe processing, we have to use the ByteBuffer API.
1070       encodeUtf8Default(in, out);
1071     }
1072 
partialIsValidUtf8(byte[] bytes, int index, int limit)1073     private static int partialIsValidUtf8(byte[] bytes, int index, int limit) {
1074       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
1075       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1076       while (index < limit && bytes[index] >= 0) {
1077         index++;
1078       }
1079 
1080       return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit);
1081     }
1082 
partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)1083     private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
1084       for (; ; ) {
1085         int byte1;
1086         int byte2;
1087 
1088         // Optimize for interior runs of ASCII bytes.
1089         do {
1090           if (index >= limit) {
1091             return COMPLETE;
1092           }
1093         } while ((byte1 = bytes[index++]) >= 0);
1094 
1095         if (byte1 < (byte) 0xE0) {
1096           // two-byte form
1097 
1098           if (index >= limit) {
1099             // Incomplete sequence
1100             return byte1;
1101           }
1102 
1103           // Simultaneously checks for illegal trailing-byte in
1104           // leading position and overlong 2-byte form.
1105           if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
1106             return MALFORMED;
1107           }
1108         } else if (byte1 < (byte) 0xF0) {
1109           // three-byte form
1110 
1111           if (index >= limit - 1) { // incomplete sequence
1112             return incompleteStateFor(bytes, index, limit);
1113           }
1114           if ((byte2 = bytes[index++]) > (byte) 0xBF
1115               // overlong? 5 most significant bits must not all be zero
1116               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1117               // check for illegal surrogate codepoints
1118               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1119               // byte3 trailing-byte test
1120               || bytes[index++] > (byte) 0xBF) {
1121             return MALFORMED;
1122           }
1123         } else {
1124           // four-byte form
1125 
1126           if (index >= limit - 2) { // incomplete sequence
1127             return incompleteStateFor(bytes, index, limit);
1128           }
1129           if ((byte2 = bytes[index++]) > (byte) 0xBF
1130               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1131               // if (byte1 > (byte) 0xF4 ||
1132               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1133               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1134               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1135               // byte3 trailing-byte test
1136               || bytes[index++] > (byte) 0xBF
1137               // byte4 trailing-byte test
1138               || bytes[index++] > (byte) 0xBF) {
1139             return MALFORMED;
1140           }
1141         }
1142       }
1143     }
1144   }
1145 
1146   /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */
1147   static final class UnsafeProcessor extends Processor {
1148     /** Indicates whether or not all required unsafe operations are supported on this platform. */
isAvailable()1149     static boolean isAvailable() {
1150       return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
1151     }
1152 
1153     @Override
partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)1154     int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
1155       // Bitwise OR combines the sign bits so any negative value fails the check.
1156       if ((index | limit | bytes.length - limit) < 0) {
1157         throw new ArrayIndexOutOfBoundsException(
1158             String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit));
1159       }
1160       long offset = index;
1161       final long offsetLimit = limit;
1162       if (state != COMPLETE) {
1163         // The previous decoding operation was incomplete (or malformed).
1164         // We look for a well-formed sequence consisting of bytes from
1165         // the previous decoding operation (stored in state) together
1166         // with bytes from the array slice.
1167         //
1168         // We expect such "straddler characters" to be rare.
1169 
1170         if (offset >= offsetLimit) { // No bytes? No progress.
1171           return state;
1172         }
1173         int byte1 = (byte) state;
1174         // byte1 is never ASCII.
1175         if (byte1 < (byte) 0xE0) {
1176           // two-byte form
1177 
1178           // Simultaneously checks for illegal trailing-byte in
1179           // leading position and overlong 2-byte form.
1180           if (byte1 < (byte) 0xC2
1181               // byte2 trailing-byte test
1182               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1183             return MALFORMED;
1184           }
1185         } else if (byte1 < (byte) 0xF0) {
1186           // three-byte form
1187 
1188           // Get byte2 from saved state or array
1189           int byte2 = (byte) ~(state >> 8);
1190           if (byte2 == 0) {
1191             byte2 = UnsafeUtil.getByte(bytes, offset++);
1192             if (offset >= offsetLimit) {
1193               return incompleteStateFor(byte1, byte2);
1194             }
1195           }
1196           if (byte2 > (byte) 0xBF
1197               // overlong? 5 most significant bits must not all be zero
1198               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1199               // illegal surrogate codepoint?
1200               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1201               // byte3 trailing-byte test
1202               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1203             return MALFORMED;
1204           }
1205         } else {
1206           // four-byte form
1207 
1208           // Get byte2 and byte3 from saved state or array
1209           int byte2 = (byte) ~(state >> 8);
1210           int byte3 = 0;
1211           if (byte2 == 0) {
1212             byte2 = UnsafeUtil.getByte(bytes, offset++);
1213             if (offset >= offsetLimit) {
1214               return incompleteStateFor(byte1, byte2);
1215             }
1216           } else {
1217             byte3 = (byte) (state >> 16);
1218           }
1219           if (byte3 == 0) {
1220             byte3 = UnsafeUtil.getByte(bytes, offset++);
1221             if (offset >= offsetLimit) {
1222               return incompleteStateFor(byte1, byte2, byte3);
1223             }
1224           }
1225 
1226           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1227           // which never occurs in well-formed UTF-8, and so we will return
1228           // MALFORMED again below.
1229 
1230           if (byte2 > (byte) 0xBF
1231               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1232               // if (byte1 > (byte) 0xF4 ||
1233               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1234               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1235               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1236               // byte3 trailing-byte test
1237               || byte3 > (byte) 0xBF
1238               // byte4 trailing-byte test
1239               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1240             return MALFORMED;
1241           }
1242         }
1243       }
1244 
1245       return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
1246     }
1247 
1248     @Override
partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1249     int partialIsValidUtf8Direct(
1250         final int state, ByteBuffer buffer, final int index, final int limit) {
1251       // Bitwise OR combines the sign bits so any negative value fails the check.
1252       if ((index | limit | buffer.limit() - limit) < 0) {
1253         throw new ArrayIndexOutOfBoundsException(
1254             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit));
1255       }
1256       long address = addressOffset(buffer) + index;
1257       final long addressLimit = address + (limit - index);
1258       if (state != COMPLETE) {
1259         // The previous decoding operation was incomplete (or malformed).
1260         // We look for a well-formed sequence consisting of bytes from
1261         // the previous decoding operation (stored in state) together
1262         // with bytes from the array slice.
1263         //
1264         // We expect such "straddler characters" to be rare.
1265 
1266         if (address >= addressLimit) { // No bytes? No progress.
1267           return state;
1268         }
1269 
1270         final int byte1 = (byte) state;
1271         // byte1 is never ASCII.
1272         if (byte1 < (byte) 0xE0) {
1273           // two-byte form
1274 
1275           // Simultaneously checks for illegal trailing-byte in
1276           // leading position and overlong 2-byte form.
1277           if (byte1 < (byte) 0xC2
1278               // byte2 trailing-byte test
1279               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1280             return MALFORMED;
1281           }
1282         } else if (byte1 < (byte) 0xF0) {
1283           // three-byte form
1284 
1285           // Get byte2 from saved state or array
1286           int byte2 = (byte) ~(state >> 8);
1287           if (byte2 == 0) {
1288             byte2 = UnsafeUtil.getByte(address++);
1289             if (address >= addressLimit) {
1290               return incompleteStateFor(byte1, byte2);
1291             }
1292           }
1293           if (byte2 > (byte) 0xBF
1294               // overlong? 5 most significant bits must not all be zero
1295               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1296               // illegal surrogate codepoint?
1297               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1298               // byte3 trailing-byte test
1299               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1300             return MALFORMED;
1301           }
1302         } else {
1303           // four-byte form
1304 
1305           // Get byte2 and byte3 from saved state or array
1306           int byte2 = (byte) ~(state >> 8);
1307           int byte3 = 0;
1308           if (byte2 == 0) {
1309             byte2 = UnsafeUtil.getByte(address++);
1310             if (address >= addressLimit) {
1311               return incompleteStateFor(byte1, byte2);
1312             }
1313           } else {
1314             byte3 = (byte) (state >> 16);
1315           }
1316           if (byte3 == 0) {
1317             byte3 = UnsafeUtil.getByte(address++);
1318             if (address >= addressLimit) {
1319               return incompleteStateFor(byte1, byte2, byte3);
1320             }
1321           }
1322 
1323           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1324           // which never occurs in well-formed UTF-8, and so we will return
1325           // MALFORMED again below.
1326 
1327           if (byte2 > (byte) 0xBF
1328               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1329               // if (byte1 > (byte) 0xF4 ||
1330               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1331               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1332               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1333               // byte3 trailing-byte test
1334               || byte3 > (byte) 0xBF
1335               // byte4 trailing-byte test
1336               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1337             return MALFORMED;
1338           }
1339         }
1340       }
1341 
1342       return partialIsValidUtf8(address, (int) (addressLimit - address));
1343     }
1344 
1345     @Override
decodeUtf8(byte[] bytes, int index, int size)1346     String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
1347       String s = new String(bytes, index, size, Internal.UTF_8);
1348 
1349       // '\uFFFD' is the UTF-8 default replacement char, which illegal byte sequences get replaced
1350       // with.
1351       if (s.indexOf('\uFFFD') < 0) {
1352         return s;
1353       }
1354 
1355       // Since s contains '\uFFFD' there are 2 options:
1356       // 1) The byte array slice is invalid UTF-8.
1357       // 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD".
1358       // To rule out (1), we encode s and compare it to the byte array slice.
1359       // If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes.
1360       if (Arrays.equals(
1361           s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) {
1362         return s;
1363       }
1364 
1365       throw InvalidProtocolBufferException.invalidUtf8();
1366     }
1367 
1368     @Override
decodeUtf8Direct(ByteBuffer buffer, int index, int size)1369     String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
1370         throws InvalidProtocolBufferException {
1371       // Bitwise OR combines the sign bits so any negative value fails the check.
1372       if ((index | size | buffer.limit() - index - size) < 0) {
1373         throw new ArrayIndexOutOfBoundsException(
1374             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
1375       }
1376       long address = UnsafeUtil.addressOffset(buffer) + index;
1377       final long addressLimit = address + size;
1378 
1379       // The longest possible resulting String is the same as the number of input bytes, when it is
1380       // all ASCII. For other cases, this over-allocates and we will truncate in the end.
1381       char[] resultArr = new char[size];
1382       int resultPos = 0;
1383 
1384       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
1385       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1386       while (address < addressLimit) {
1387         byte b = UnsafeUtil.getByte(address);
1388         if (!DecodeUtil.isOneByte(b)) {
1389           break;
1390         }
1391         address++;
1392         DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1393       }
1394 
1395       while (address < addressLimit) {
1396         byte byte1 = UnsafeUtil.getByte(address++);
1397         if (DecodeUtil.isOneByte(byte1)) {
1398           DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
1399           // It's common for there to be multiple ASCII characters in a run mixed in, so add an
1400           // extra optimized loop to take care of these runs.
1401           while (address < addressLimit) {
1402             byte b = UnsafeUtil.getByte(address);
1403             if (!DecodeUtil.isOneByte(b)) {
1404               break;
1405             }
1406             address++;
1407             DecodeUtil.handleOneByte(b, resultArr, resultPos++);
1408           }
1409         } else if (DecodeUtil.isTwoBytes(byte1)) {
1410           if (address >= addressLimit) {
1411             throw InvalidProtocolBufferException.invalidUtf8();
1412           }
1413           DecodeUtil.handleTwoBytes(
1414               byte1, /* byte2 */ UnsafeUtil.getByte(address++), resultArr, resultPos++);
1415         } else if (DecodeUtil.isThreeBytes(byte1)) {
1416           if (address >= addressLimit - 1) {
1417             throw InvalidProtocolBufferException.invalidUtf8();
1418           }
1419           DecodeUtil.handleThreeBytes(
1420               byte1,
1421               /* byte2 */ UnsafeUtil.getByte(address++),
1422               /* byte3 */ UnsafeUtil.getByte(address++),
1423               resultArr,
1424               resultPos++);
1425         } else {
1426           if (address >= addressLimit - 2) {
1427             throw InvalidProtocolBufferException.invalidUtf8();
1428           }
1429           DecodeUtil.handleFourBytes(
1430               byte1,
1431               /* byte2 */ UnsafeUtil.getByte(address++),
1432               /* byte3 */ UnsafeUtil.getByte(address++),
1433               /* byte4 */ UnsafeUtil.getByte(address++),
1434               resultArr,
1435               resultPos++);
1436           // 4-byte case requires two chars.
1437           resultPos++;
1438         }
1439       }
1440 
1441       return new String(resultArr, 0, resultPos);
1442     }
1443 
1444     @Override
encodeUtf8(final String in, final byte[] out, final int offset, final int length)1445     int encodeUtf8(final String in, final byte[] out, final int offset, final int length) {
1446       long outIx = offset;
1447       final long outLimit = outIx + length;
1448       final int inLimit = in.length();
1449       if (inLimit > length || out.length - length < offset) {
1450         // Not even enough room for an ASCII-encoded string.
1451         throw new ArrayIndexOutOfBoundsException(
1452             "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1453       }
1454 
1455       // Designed to take advantage of
1456       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1457       int inIx = 0;
1458       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1459         UnsafeUtil.putByte(out, outIx++, (byte) c);
1460       }
1461       if (inIx == inLimit) {
1462         // We're done, it was ASCII encoded.
1463         return (int) outIx;
1464       }
1465 
1466       for (char c; inIx < inLimit; ++inIx) {
1467         c = in.charAt(inIx);
1468         if (c < 0x80 && outIx < outLimit) {
1469           UnsafeUtil.putByte(out, outIx++, (byte) c);
1470         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1471           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1472           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1473         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1474           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1475           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1476           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1477           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1478         } else if (outIx <= outLimit - 4L) {
1479           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1480           // bytes
1481           final char low;
1482           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1483             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1484           }
1485           int codePoint = toCodePoint(c, low);
1486           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1487           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1488           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1489           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1490         } else {
1491           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1492               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1493             // We are surrogates and we're not a surrogate pair.
1494             throw new UnpairedSurrogateException(inIx, inLimit);
1495           }
1496           // Not enough space in the output buffer.
1497           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1498         }
1499       }
1500 
1501       // All bytes have been encoded.
1502       return (int) outIx;
1503     }
1504 
1505     @Override
encodeUtf8Direct(String in, ByteBuffer out)1506     void encodeUtf8Direct(String in, ByteBuffer out) {
1507       final long address = addressOffset(out);
1508       long outIx = address + out.position();
1509       final long outLimit = address + out.limit();
1510       final int inLimit = in.length();
1511       if (inLimit > outLimit - outIx) {
1512         // Not even enough room for an ASCII-encoded string.
1513         throw new ArrayIndexOutOfBoundsException(
1514             "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit());
1515       }
1516 
1517       // Designed to take advantage of
1518       // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1519       int inIx = 0;
1520       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1521         UnsafeUtil.putByte(outIx++, (byte) c);
1522       }
1523       if (inIx == inLimit) {
1524         // We're done, it was ASCII encoded.
1525         Java8Compatibility.position(out, (int) (outIx - address));
1526         return;
1527       }
1528 
1529       for (char c; inIx < inLimit; ++inIx) {
1530         c = in.charAt(inIx);
1531         if (c < 0x80 && outIx < outLimit) {
1532           UnsafeUtil.putByte(outIx++, (byte) c);
1533         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1534           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1535           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1536         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1537           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1538           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1539           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1540           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1541         } else if (outIx <= outLimit - 4L) {
1542           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1543           // bytes
1544           final char low;
1545           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1546             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1547           }
1548           int codePoint = toCodePoint(c, low);
1549           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1550           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1551           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1552           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1553         } else {
1554           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1555               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1556             // We are surrogates and we're not a surrogate pair.
1557             throw new UnpairedSurrogateException(inIx, inLimit);
1558           }
1559           // Not enough space in the output buffer.
1560           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1561         }
1562       }
1563 
1564       // All bytes have been encoded.
1565       Java8Compatibility.position(out, (int) (outIx - address));
1566     }
1567 
1568     /**
1569      * Counts (approximately) the number of consecutive ASCII characters starting from the given
1570      * position, using the most efficient method available to the platform.
1571      *
1572      * @param bytes the array containing the character sequence
1573      * @param offset the offset position of the index (same as index + arrayBaseOffset)
1574      * @param maxChars the maximum number of characters to count
1575      * @return the number of ASCII characters found. The stopping position will be at or before the
1576      *     first non-ASCII byte.
1577      */
unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1578     private static int unsafeEstimateConsecutiveAscii(
1579         byte[] bytes, long offset, final int maxChars) {
1580       if (maxChars < UNSAFE_COUNT_ASCII_THRESHOLD) {
1581         // Don't bother with small strings.
1582         return 0;
1583       }
1584 
1585       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1586       // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that
1587       // the index (relative to the start of the array) is also 8-byte aligned. We do this by
1588       // ANDing the index with 7 to determine the number of bytes that need to be read before
1589       // we're 8-byte aligned.
1590       final int unaligned = 8 - ((int) offset & 7);
1591       int i;
1592       for (i = 0; i < unaligned; i++) {
1593         if (UnsafeUtil.getByte(bytes, offset++) < 0) {
1594           return i;
1595         }
1596       }
1597 
1598       for (; i + 8 <= maxChars; i += 8) {
1599         if ((UnsafeUtil.getLong(bytes, UnsafeUtil.BYTE_ARRAY_BASE_OFFSET + offset)
1600                 & ASCII_MASK_LONG)
1601             != 0L) {
1602           break;
1603         }
1604         offset += 8;
1605       }
1606 
1607       for (; i < maxChars; i++) {
1608         if (UnsafeUtil.getByte(bytes, offset++) < 0) {
1609           return i;
1610         }
1611       }
1612       return maxChars;
1613     }
1614 
1615     /**
1616      * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1617      * most efficient method available to the platform.
1618      */
unsafeEstimateConsecutiveAscii(long address, final int maxChars)1619     private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) {
1620       int remaining = maxChars;
1621       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1622         // Don't bother with small strings.
1623         return 0;
1624       }
1625 
1626       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1627       // This is equivalent to (8-address) mod 8, the number of bytes we need to read before we're
1628       // 8-byte aligned.
1629       final int unaligned = (int) (-address & 7);
1630       for (int j = unaligned; j > 0; j--) {
1631         if (UnsafeUtil.getByte(address++) < 0) {
1632           return unaligned - j;
1633         }
1634       }
1635 
1636       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1637       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1638       // determine if any byte in the current long is non-ASCII.
1639       remaining -= unaligned;
1640       for (;
1641           remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
1642           address += 8, remaining -= 8) {}
1643       return maxChars - remaining;
1644     }
1645 
partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1646     private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
1647       // Skip past ASCII characters as quickly as possible.
1648       final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
1649       remaining -= skipped;
1650       offset += skipped;
1651 
1652       for (; ; ) {
1653         // Optimize for interior runs of ASCII bytes.
1654         // TODO: Consider checking 8 bytes at a time after some threshold?
1655         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1656         int byte1 = 0;
1657         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {}
1658         if (remaining == 0) {
1659           return COMPLETE;
1660         }
1661         remaining--;
1662 
1663         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1664         if (byte1 < (byte) 0xE0) {
1665           // Two-byte form (110xxxxx 10xxxxxx)
1666           if (remaining == 0) {
1667             // Incomplete sequence
1668             return byte1;
1669           }
1670           remaining--;
1671 
1672           // Simultaneously checks for illegal trailing-byte in
1673           // leading position and overlong 2-byte form.
1674           if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1675             return MALFORMED;
1676           }
1677         } else if (byte1 < (byte) 0xF0) {
1678           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1679           if (remaining < 2) {
1680             // Incomplete sequence
1681             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1682           }
1683           remaining -= 2;
1684 
1685           final int byte2;
1686           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1687               // overlong? 5 most significant bits must not all be zero
1688               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1689               // check for illegal surrogate codepoints
1690               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1691               // byte3 trailing-byte test
1692               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1693             return MALFORMED;
1694           }
1695         } else {
1696           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
1697           if (remaining < 3) {
1698             // Incomplete sequence
1699             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1700           }
1701           remaining -= 3;
1702 
1703           final int byte2;
1704           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1705               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1706               // if (byte1 > (byte) 0xF4 ||
1707               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1708               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1709               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1710               // byte3 trailing-byte test
1711               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF
1712               // byte4 trailing-byte test
1713               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1714             return MALFORMED;
1715           }
1716         }
1717       }
1718     }
1719 
partialIsValidUtf8(long address, int remaining)1720     private static int partialIsValidUtf8(long address, int remaining) {
1721       // Skip past ASCII characters as quickly as possible.
1722       final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1723       address += skipped;
1724       remaining -= skipped;
1725 
1726       for (; ; ) {
1727         // Optimize for interior runs of ASCII bytes.
1728         // TODO: Consider checking 8 bytes at a time after some threshold?
1729         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1730         int byte1 = 0;
1731         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {}
1732         if (remaining == 0) {
1733           return COMPLETE;
1734         }
1735         remaining--;
1736 
1737         if (byte1 < (byte) 0xE0) {
1738           // Two-byte form
1739 
1740           if (remaining == 0) {
1741             // Incomplete sequence
1742             return byte1;
1743           }
1744           remaining--;
1745 
1746           // Simultaneously checks for illegal trailing-byte in
1747           // leading position and overlong 2-byte form.
1748           if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1749             return MALFORMED;
1750           }
1751         } else if (byte1 < (byte) 0xF0) {
1752           // Three-byte form
1753 
1754           if (remaining < 2) {
1755             // Incomplete sequence
1756             return unsafeIncompleteStateFor(address, byte1, remaining);
1757           }
1758           remaining -= 2;
1759 
1760           final byte byte2 = UnsafeUtil.getByte(address++);
1761           if (byte2 > (byte) 0xBF
1762               // overlong? 5 most significant bits must not all be zero
1763               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1764               // check for illegal surrogate codepoints
1765               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1766               // byte3 trailing-byte test
1767               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1768             return MALFORMED;
1769           }
1770         } else {
1771           // Four-byte form
1772 
1773           if (remaining < 3) {
1774             // Incomplete sequence
1775             return unsafeIncompleteStateFor(address, byte1, remaining);
1776           }
1777           remaining -= 3;
1778 
1779           final byte byte2 = UnsafeUtil.getByte(address++);
1780           if (byte2 > (byte) 0xBF
1781               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1782               // if (byte1 > (byte) 0xF4 ||
1783               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1784               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1785               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1786               // byte3 trailing-byte test
1787               || UnsafeUtil.getByte(address++) > (byte) 0xBF
1788               // byte4 trailing-byte test
1789               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1790             return MALFORMED;
1791           }
1792         }
1793       }
1794     }
1795 
unsafeIncompleteStateFor( byte[] bytes, int byte1, long offset, int remaining)1796     private static int unsafeIncompleteStateFor(
1797         byte[] bytes, int byte1, long offset, int remaining) {
1798       switch (remaining) {
1799         case 0:
1800           return incompleteStateFor(byte1);
1801         case 1:
1802           return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
1803         case 2:
1804           return incompleteStateFor(
1805               byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1));
1806         default:
1807           throw new AssertionError();
1808       }
1809     }
1810 
unsafeIncompleteStateFor(long address, final int byte1, int remaining)1811     private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
1812       switch (remaining) {
1813         case 0:
1814           return incompleteStateFor(byte1);
1815         case 1:
1816           return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
1817         case 2:
1818           return incompleteStateFor(
1819               byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1));
1820         default:
1821           throw new AssertionError();
1822       }
1823     }
1824   }
1825 
1826   /**
1827    * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
1828    * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
1829    * checks and codepoint conversion happen in this class.
1830    */
1831   private static class DecodeUtil {
1832 
1833     /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */
isOneByte(byte b)1834     private static boolean isOneByte(byte b) {
1835       return b >= 0;
1836     }
1837 
1838     /**
1839      * Returns whether this is a two-byte codepoint with the form '10XXXXXX' iff
1840      * {@link #isOneByte(byte)} is false. This private method works in the limited use in
1841      * this class where this method is only called when {@link #isOneByte(byte)} has already
1842      * returned false. It is not suitable for general or public use.
1843      */
isTwoBytes(byte b)1844     private static boolean isTwoBytes(byte b) {
1845       return b < (byte) 0xE0;
1846     }
1847 
1848     /**
1849      * Returns whether this is a three-byte codepoint with the form '110XXXXX' iff
1850      * {@link #isOneByte(byte)} and {@link #isTwoBytes(byte)} are false.
1851      * This private method works in the limited use in
1852      * this class where this method is only called when {@link #isOneByte(byte)} an
1853      * {@link #isTwoBytes(byte)} have already returned false. It is not suitable for general
1854      * or public use.
1855      */
isThreeBytes(byte b)1856     private static boolean isThreeBytes(byte b) {
1857       return b < (byte) 0xF0;
1858     }
1859 
handleOneByte(byte byte1, char[] resultArr, int resultPos)1860     private static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
1861       resultArr[resultPos] = (char) byte1;
1862     }
1863 
handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)1864     private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)
1865         throws InvalidProtocolBufferException {
1866       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
1867       // overlong 2-byte, '11000001'.
1868       if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) {
1869         throw InvalidProtocolBufferException.invalidUtf8();
1870       }
1871       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
1872     }
1873 
handleThreeBytes( byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)1874     private static void handleThreeBytes(
1875         byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
1876         throws InvalidProtocolBufferException {
1877       if (isNotTrailingByte(byte2)
1878           // overlong? 5 most significant bits must not all be zero
1879           || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1880           // check for illegal surrogate codepoints
1881           || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1882           || isNotTrailingByte(byte3)) {
1883         throw InvalidProtocolBufferException.invalidUtf8();
1884       }
1885       resultArr[resultPos] =
1886           (char)
1887               (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
1888     }
1889 
handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)1890     private static void handleFourBytes(
1891         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
1892         throws InvalidProtocolBufferException {
1893       if (isNotTrailingByte(byte2)
1894           // Check that 1 <= plane <= 16.  Tricky optimized form of:
1895           //   valid 4-byte leading byte?
1896           // if (byte1 > (byte) 0xF4 ||
1897           //   overlong? 4 most significant bits must not all be zero
1898           //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1899           //   codepoint larger than the highest code point (U+10FFFF)?
1900           //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1901           || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1902           || isNotTrailingByte(byte3)
1903           || isNotTrailingByte(byte4)) {
1904         throw InvalidProtocolBufferException.invalidUtf8();
1905       }
1906       int codepoint =
1907           ((byte1 & 0x07) << 18)
1908               | (trailingByteValue(byte2) << 12)
1909               | (trailingByteValue(byte3) << 6)
1910               | trailingByteValue(byte4);
1911       resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
1912       resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
1913     }
1914 
1915     /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */
isNotTrailingByte(byte b)1916     private static boolean isNotTrailingByte(byte b) {
1917       return b > (byte) 0xBF;
1918     }
1919 
1920     /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */
trailingByteValue(byte b)1921     private static int trailingByteValue(byte b) {
1922       return b & 0x3F;
1923     }
1924 
highSurrogate(int codePoint)1925     private static char highSurrogate(int codePoint) {
1926       return (char)
1927           ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
1928     }
1929 
lowSurrogate(int codePoint)1930     private static char lowSurrogate(int codePoint) {
1931       return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
1932     }
1933   }
1934 
Utf8()1935   private Utf8() {}
1936 }
1937