• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import static com.google.common.truth.Truth.assertThat;
20 import static java.lang.Character.MAX_CODE_POINT;
21 import static java.lang.Character.MAX_HIGH_SURROGATE;
22 import static java.lang.Character.MAX_LOW_SURROGATE;
23 import static java.lang.Character.MIN_HIGH_SURROGATE;
24 import static java.lang.Character.MIN_LOW_SURROGATE;
25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
26 import static java.nio.charset.StandardCharsets.UTF_8;
27 
28 import com.google.common.annotations.GwtCompatible;
29 import com.google.common.annotations.GwtIncompatible;
30 import com.google.common.collect.ImmutableList;
31 import java.util.Arrays;
32 import java.util.HashMap;
33 import java.util.Random;
34 import junit.framework.TestCase;
35 
36 /**
37  * Unit tests for {@link Utf8}.
38  *
39  * @author Jon Perlow
40  * @author Martin Buchholz
41  * @author Clément Roux
42  */
43 @GwtCompatible(emulated = true)
44 public class Utf8Test extends TestCase {
45 
46   private static final ImmutableList<String> ILL_FORMED_STRINGS;
47 
48   static {
49     ImmutableList.Builder<String> builder = ImmutableList.builder();
50     char[] surrogates = {
51       MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
52     };
53     for (char surrogate : surrogates) {
newString(surrogate)54       builder.add(newString(surrogate));
newString(surrogate, 'n')55       builder.add(newString(surrogate, 'n'));
56       builder.add(newString('n', surrogate));
newString(surrogate, surrogate)57       builder.add(newString(surrogate, surrogate));
58     }
newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)59     builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
60     ILL_FORMED_STRINGS = builder.build();
61   }
62 
testEncodedLength_validStrings()63   public void testEncodedLength_validStrings() {
64     assertEquals(0, Utf8.encodedLength(""));
65     assertEquals(11, Utf8.encodedLength("Hello world"));
66     assertEquals(8, Utf8.encodedLength("Résumé"));
67     assertEquals(
68         461,
69         Utf8.encodedLength(
70             "威廉·莎士比亞(William Shakespeare,"
71                 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
72                 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
73                 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
74                 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
75                 + "哈都拕人翻譯做好多話。"));
76     // A surrogate pair
77     assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
78   }
79 
testEncodedLength_validStrings2()80   public void testEncodedLength_validStrings2() {
81     HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
82     utf8Lengths.put(0x00, 1);
83     utf8Lengths.put(0x7f, 1);
84     utf8Lengths.put(0x80, 2);
85     utf8Lengths.put(0x7ff, 2);
86     utf8Lengths.put(0x800, 3);
87     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
88     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
89     utf8Lengths.put(MAX_CODE_POINT, 4);
90 
91     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {});
92     StringBuilder sb = new StringBuilder();
93     Random rnd = new Random();
94     for (int trial = 0; trial < 100; trial++) {
95       sb.setLength(0);
96       int utf8Length = 0;
97       for (int i = 0; i < 6; i++) {
98         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
99         sb.appendCodePoint(randomCodePoint);
100         utf8Length += utf8Lengths.get(randomCodePoint);
101         if (utf8Length != Utf8.encodedLength(sb)) {
102           StringBuilder repro = new StringBuilder();
103           for (int j = 0; j < sb.length(); j++) {
104             repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
105           }
106           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
107         }
108       }
109     }
110   }
111 
testEncodedLength_invalidStrings()112   public void testEncodedLength_invalidStrings() {
113     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
114     testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
115     testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
116     testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
117     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
118   }
119 
testEncodedLengthFails(String invalidString, int invalidCodePointIndex)120   private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
121     try {
122       Utf8.encodedLength(invalidString);
123       fail();
124     } catch (IllegalArgumentException expected) {
125       assertThat(expected)
126           .hasMessageThat()
127           .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
128     }
129   }
130 
131   // 128 - [chars 0x0000 to 0x007f]
132   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
133 
134   // 128
135   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
136       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
137 
138   // 1920 [chars 0x0080 to 0x07FF]
139   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
140 
141   // 18,304
142   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
143       // Both bytes are one byte characters
144       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
145           +
146           // The possible number of two byte characters
147           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
148 
149   // 2048
150   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
151 
152   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
153   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
154       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
155 
156   // 2,650,112
157   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
158       // All one byte characters
159       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
160           +
161           // One two byte character and a one byte character
162           2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
163           +
164           // Three byte characters
165           THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
166 
167   // 1,048,576 [chars 0x10000L to 0x10FFFF]
168   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
169 
170   // 289,571,839
171   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
172       // All one byte characters
173       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
174           +
175           // One and three byte characters
176           2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
177           +
178           // Two two byte characters
179           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
180           +
181           // Permutations of one and two byte characters
182           3
183               * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
184               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
185               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
186           +
187           // Four byte characters
188           FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
189 
190   /** Tests that round tripping of all two byte permutations work. */
191   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_1Byte()192   public void testIsWellFormed_1Byte() {
193     testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
194   }
195 
196   /** Tests that round tripping of all two byte permutations work. */
197   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_2Bytes()198   public void testIsWellFormed_2Bytes() {
199     testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
200   }
201 
202   /** Tests that round tripping of all three byte permutations work. */
203   @GwtIncompatible // java.nio.charset.Charset
204 
testIsWellFormed_3Bytes()205   public void testIsWellFormed_3Bytes() {
206     testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
207   }
208 
209   /**
210    * Tests that round tripping of a sample of four byte permutations work. All permutations are
211    * prohibitively expensive to test for automated runs. This method tests specific four-byte cases.
212    */
testIsWellFormed_4BytesSamples()213   public void testIsWellFormed_4BytesSamples() {
214     // Valid 4 byte.
215     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
216     // Bad trailing bytes
217     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
218     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
219     // Special cases for byte2
220     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
221     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
222   }
223 
224   /** Tests some hard-coded test cases. */
testSomeSequences()225   public void testSomeSequences() {
226     // Empty
227     assertWellFormed();
228     // One-byte characters, including control characters
229     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
230     // Two-byte characters
231     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
232     // Three-byte characters
233     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
234     // Four-byte characters
235     // "\u024B62\u024B62"
236     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
237     // Mixed string
238     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
239     assertWellFormed(
240         0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
241         0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
242     // Not a valid string
243     assertNotWellFormed(-1, 0, -1, 0);
244   }
245 
testShardsHaveExpectedRoundTrippables()246   public void testShardsHaveExpectedRoundTrippables() {
247     // A sanity check.
248     long actual = 0;
249     for (long expected : generateFourByteShardsExpectedRunnables()) {
250       actual += expected;
251     }
252     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
253   }
254 
newString(char... chars)255   private static String newString(char... chars) {
256     return new String(chars);
257   }
258 
toByteArray(int... bytes)259   private static byte[] toByteArray(int... bytes) {
260     byte[] realBytes = new byte[bytes.length];
261     for (int i = 0; i < bytes.length; i++) {
262       realBytes[i] = (byte) bytes[i];
263     }
264     return realBytes;
265   }
266 
assertWellFormed(int... bytes)267   private static void assertWellFormed(int... bytes) {
268     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
269   }
270 
assertNotWellFormed(int... bytes)271   private static void assertNotWellFormed(int... bytes) {
272     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
273   }
274 
generateFourByteShardsExpectedRunnables()275   private static long[] generateFourByteShardsExpectedRunnables() {
276     long[] expected = new long[128];
277     // 0-63 are all 5300224
278     for (int i = 0; i <= 63; i++) {
279       expected[i] = 5300224;
280     }
281     // 97-111 are all 2342912
282     for (int i = 97; i <= 111; i++) {
283       expected[i] = 2342912;
284     }
285     // 113-117 are all 1048576
286     for (int i = 113; i <= 117; i++) {
287       expected[i] = 1048576;
288     }
289     // One offs
290     expected[112] = 786432;
291     expected[118] = 786432;
292     expected[119] = 1048576;
293     expected[120] = 458752;
294     expected[121] = 524288;
295     expected[122] = 65536;
296     // Anything not assigned was the default 0.
297     return expected;
298   }
299 
300   /**
301    * Helper to run the loop to test all the permutations for the number of bytes specified.
302    *
303    * @param numBytes the number of bytes in the byte array
304    * @param expectedCount the expected number of roundtrippable permutations
305    */
306   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount)307   private static void testBytes(int numBytes, long expectedCount) {
308     testBytes(numBytes, expectedCount, 0, -1);
309   }
310 
311   /**
312    * Helper to run the loop to test all the permutations for the number of bytes specified. This
313    * overload is useful for debugging to get the loop to start at a certain character.
314    *
315    * @param numBytes the number of bytes in the byte array
316    * @param expectedCount the expected number of roundtrippable permutations
317    * @param start the starting bytes encoded as a long as big-endian
318    * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
319    *     limit for numBytes
320    */
321   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount, long start, long lim)322   private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
323     byte[] bytes = new byte[numBytes];
324     if (lim == -1) {
325       lim = 1L << (numBytes * 8);
326     }
327     long countRoundTripped = 0;
328     for (long byteChar = start; byteChar < lim; byteChar++) {
329       long tmpByteChar = byteChar;
330       for (int i = 0; i < numBytes; i++) {
331         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
332         tmpByteChar = tmpByteChar >> 8;
333       }
334       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
335       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
336       String s = new String(bytes, UTF_8);
337       byte[] bytesReencoded = s.getBytes(UTF_8);
338       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
339 
340       if (bytesEqual != isRoundTrippable) {
341         fail();
342       }
343       if (isRoundTrippable) {
344         countRoundTripped++;
345       }
346     }
347     assertEquals(expectedCount, countRoundTripped);
348   }
349 }
350