• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.common.base;
16 
17 import static com.google.common.base.Preconditions.checkPositionIndexes;
18 
19 import com.google.common.annotations.Beta;
20 import com.google.common.annotations.GwtCompatible;
21 
22 /**
23  * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
24  * character encoding. UTF-8 is defined in section D92 of
25  * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
26  * Specification, Chapter 3</a>.
27  *
28  * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
29  * introduced in Unicode 3.1. One implication of this is that it rejects
30  * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
31  * sequences, even though the JDK decoder may accept them.
32  *
33  * @author Martin Buchholz
34  * @author Clément Roux
35  * @since 16.0
36  */
37 @Beta
38 @GwtCompatible
39 public final class Utf8 {
40   /**
41    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
42    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
43    * both time and space.
44    *
45    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
46    *     surrogates)
47    */
encodedLength(CharSequence sequence)48   public static int encodedLength(CharSequence sequence) {
49     // Warning to maintainers: this implementation is highly optimized.
50     int utf16Length = sequence.length();
51     int utf8Length = utf16Length;
52     int i = 0;
53 
54     // This loop optimizes for pure ASCII.
55     while (i < utf16Length && sequence.charAt(i) < 0x80) {
56       i++;
57     }
58 
59     // This loop optimizes for chars less than 0x800.
60     for (; i < utf16Length; i++) {
61       char c = sequence.charAt(i);
62       if (c < 0x800) {
63         utf8Length += ((0x7f - c) >>> 31);  // branch free!
64       } else {
65         utf8Length += encodedLengthGeneral(sequence, i);
66         break;
67       }
68     }
69 
70     if (utf8Length < utf16Length) {
71       // Necessary and sufficient condition for overflow because of maximum 3x expansion
72       throw new IllegalArgumentException("UTF-8 length does not fit in int: "
73                                          + (utf8Length + (1L << 32)));
74     }
75     return utf8Length;
76   }
77 
encodedLengthGeneral(CharSequence sequence, int start)78   private static int encodedLengthGeneral(CharSequence sequence, int start) {
79     int utf16Length = sequence.length();
80     int utf8Length = 0;
81     for (int i = start; i < utf16Length; i++) {
82       char c = sequence.charAt(i);
83       if (c < 0x800) {
84         utf8Length += (0x7f - c) >>> 31; // branch free!
85       } else {
86         utf8Length += 2;
87         // jdk7+: if (Character.isSurrogate(c)) {
88         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
89           // Check that we have a well-formed surrogate pair.
90           int cp = Character.codePointAt(sequence, i);
91           if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
92             throw new IllegalArgumentException("Unpaired surrogate at index " + i);
93           }
94           i++;
95         }
96       }
97     }
98     return utf8Length;
99   }
100 
101   /**
102    * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
103    * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
104    * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
105    * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
106    * well-formed.
107    *
108    * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
109    * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
110    */
isWellFormed(byte[] bytes)111   public static boolean isWellFormed(byte[] bytes) {
112     return isWellFormed(bytes, 0, bytes.length);
113   }
114 
115   /**
116    * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
117    * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
118    * isWellFormed(bytes)} is true.
119    *
120    * @param bytes the input buffer
121    * @param off the offset in the buffer of the first byte to read
122    * @param len the number of bytes to read from the buffer
123    */
isWellFormed(byte[] bytes, int off, int len)124   public static boolean isWellFormed(byte[] bytes, int off, int len) {
125     int end = off + len;
126     checkPositionIndexes(off, end, bytes.length);
127     // Look for the first non-ASCII character.
128     for (int i = off; i < end; i++) {
129       if (bytes[i] < 0) {
130         return isWellFormedSlowPath(bytes, i, end);
131       }
132     }
133     return true;
134   }
135 
isWellFormedSlowPath(byte[] bytes, int off, int end)136   private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
137     int index = off;
138     while (true) {
139       int byte1;
140 
141       // Optimize for interior runs of ASCII bytes.
142       do {
143         if (index >= end) {
144           return true;
145         }
146       } while ((byte1 = bytes[index++]) >= 0);
147 
148       if (byte1 < (byte) 0xE0) {
149         // Two-byte form.
150         if (index == end) {
151           return false;
152         }
153         // Simultaneously check for illegal trailing-byte in leading position
154         // and overlong 2-byte form.
155         if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
156           return false;
157         }
158       } else if (byte1 < (byte) 0xF0) {
159         // Three-byte form.
160         if (index + 1 >= end) {
161           return false;
162         }
163         int byte2 = bytes[index++];
164         if (byte2 > (byte) 0xBF
165             // Overlong? 5 most significant bits must not all be zero.
166             || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
167             // Check for illegal surrogate codepoints.
168             || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
169             // Third byte trailing-byte test.
170             || bytes[index++] > (byte) 0xBF) {
171           return false;
172         }
173       } else {
174         // Four-byte form.
175         if (index + 2 >= end) {
176           return false;
177         }
178         int byte2 = bytes[index++];
179         if (byte2 > (byte) 0xBF
180             // Check that 1 <= plane <= 16. Tricky optimized form of:
181             // if (byte1 > (byte) 0xF4
182             //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
183             //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
184             || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
185             // Third byte trailing-byte test
186             || bytes[index++] > (byte) 0xBF
187             // Fourth byte trailing-byte test
188             || bytes[index++] > (byte) 0xBF) {
189           return false;
190         }
191       }
192     }
193   }
194 
Utf8()195   private Utf8() {}
196 }
197