• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import static com.google.common.truth.Truth.assertThat;
20 import static java.lang.Character.MAX_CODE_POINT;
21 import static java.lang.Character.MAX_HIGH_SURROGATE;
22 import static java.lang.Character.MAX_LOW_SURROGATE;
23 import static java.lang.Character.MIN_HIGH_SURROGATE;
24 import static java.lang.Character.MIN_LOW_SURROGATE;
25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
26 
27 import com.google.common.annotations.GwtCompatible;
28 import com.google.common.annotations.GwtIncompatible;
29 import com.google.common.annotations.J2ktIncompatible;
30 import com.google.common.collect.ImmutableList;
31 import java.util.Arrays;
32 import java.util.HashMap;
33 import java.util.Random;
34 import junit.framework.TestCase;
35 
36 /**
37  * Unit tests for {@link Utf8}.
38  *
39  * @author Jon Perlow
40  * @author Martin Buchholz
41  * @author Clément Roux
42  */
43 @GwtCompatible(emulated = true)
44 public class Utf8Test extends TestCase {
45 
46   private static final ImmutableList<String> ILL_FORMED_STRINGS;
47 
48   static {
49     ImmutableList.Builder<String> builder = ImmutableList.builder();
50     char[] surrogates = {
51       MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
52     };
53     for (char surrogate : surrogates) {
newString(surrogate)54       builder.add(newString(surrogate));
newString(surrogate, 'n')55       builder.add(newString(surrogate, 'n'));
56       builder.add(newString('n', surrogate));
newString(surrogate, surrogate)57       builder.add(newString(surrogate, surrogate));
58     }
newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)59     builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
60     ILL_FORMED_STRINGS = builder.build();
61   }
62 
testEncodedLength_validStrings()63   public void testEncodedLength_validStrings() {
64     assertEquals(0, Utf8.encodedLength(""));
65     assertEquals(11, Utf8.encodedLength("Hello world"));
66     assertEquals(8, Utf8.encodedLength("Résumé"));
67     assertEquals(
68         461,
69         Utf8.encodedLength(
70             "威廉·莎士比亞(William Shakespeare,"
71                 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
72                 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
73                 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
74                 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
75                 + "哈都拕人翻譯做好多話。"));
76     // A surrogate pair
77     assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
78   }
79 
testEncodedLength_validStrings2()80   public void testEncodedLength_validStrings2() {
81     HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
82     utf8Lengths.put(0x00, 1);
83     utf8Lengths.put(0x7f, 1);
84     utf8Lengths.put(0x80, 2);
85     utf8Lengths.put(0x7ff, 2);
86     utf8Lengths.put(0x800, 3);
87     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
88     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
89     utf8Lengths.put(MAX_CODE_POINT, 4);
90 
91     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {});
92     StringBuilder sb = new StringBuilder();
93     Random rnd = new Random();
94     for (int trial = 0; trial < 100; trial++) {
95       sb.setLength(0);
96       int utf8Length = 0;
97       for (int i = 0; i < 6; i++) {
98         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
99         sb.appendCodePoint(randomCodePoint);
100         utf8Length += utf8Lengths.get(randomCodePoint);
101         if (utf8Length != Utf8.encodedLength(sb)) {
102           StringBuilder repro = new StringBuilder();
103           for (int j = 0; j < sb.length(); j++) {
104             repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
105           }
106           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
107         }
108       }
109     }
110   }
111 
testEncodedLength_invalidStrings()112   public void testEncodedLength_invalidStrings() {
113     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
114     testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
115     testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
116     testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
117     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
118   }
119 
testEncodedLengthFails(String invalidString, int invalidCodePointIndex)120   private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
121     try {
122       Utf8.encodedLength(invalidString);
123       fail();
124     } catch (IllegalArgumentException expected) {
125       assertThat(expected)
126           .hasMessageThat()
127           .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
128     }
129   }
130 
131   // 128 - [chars 0x0000 to 0x007f]
132   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
133 
134   // 128
135   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
136       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
137 
138   // 1920 [chars 0x0080 to 0x07FF]
139   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
140 
141   // 18,304
142   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
143       // Both bytes are one byte characters
144       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
145           +
146           // The possible number of two byte characters
147           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
148 
149   // 2048
150   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
151 
152   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
153   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
154       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
155 
156   // 2,650,112
157   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
158       // All one byte characters
159       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
160           +
161           // One two byte character and a one byte character
162           2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
163           +
164           // Three byte characters
165           THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
166 
167   // 1,048,576 [chars 0x10000L to 0x10FFFF]
168   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
169 
170   // 289,571,839
171   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
172       // All one byte characters
173       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
174           +
175           // One and three byte characters
176           2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
177           +
178           // Two two byte characters
179           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
180           +
181           // Permutations of one and two byte characters
182           3
183               * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
184               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
185               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
186           +
187           // Four byte characters
188           FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
189 
190   /** Tests that round tripping of all two byte permutations work. */
191   @J2ktIncompatible
192   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_1Byte()193   public void testIsWellFormed_1Byte() {
194     testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
195   }
196 
197   /** Tests that round tripping of all two byte permutations work. */
198   @J2ktIncompatible
199   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_2Bytes()200   public void testIsWellFormed_2Bytes() {
201     testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
202   }
203 
204   /** Tests that round tripping of all three byte permutations work. */
205   @J2ktIncompatible
206   @GwtIncompatible // java.nio.charset.Charset
207 
testIsWellFormed_3Bytes()208   public void testIsWellFormed_3Bytes() {
209     testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
210   }
211 
212   /**
213    * Tests that round tripping of a sample of four byte permutations work. All permutations are
214    * prohibitively expensive to test for automated runs. This method tests specific four-byte cases.
215    */
testIsWellFormed_4BytesSamples()216   public void testIsWellFormed_4BytesSamples() {
217     // Valid 4 byte.
218     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
219     // Bad trailing bytes
220     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
221     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
222     // Special cases for byte2
223     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
224     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
225   }
226 
227   /** Tests some hard-coded test cases. */
testSomeSequences()228   public void testSomeSequences() {
229     // Empty
230     assertWellFormed();
231     // One-byte characters, including control characters
232     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
233     // Two-byte characters
234     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
235     // Three-byte characters
236     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
237     // Four-byte characters
238     // "\u024B62\u024B62"
239     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
240     // Mixed string
241     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
242     assertWellFormed(
243         0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
244         0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
245     // Not a valid string
246     assertNotWellFormed(-1, 0, -1, 0);
247   }
248 
testShardsHaveExpectedRoundTrippables()249   public void testShardsHaveExpectedRoundTrippables() {
250     // A sanity check.
251     long actual = 0;
252     for (long expected : generateFourByteShardsExpectedRunnables()) {
253       actual += expected;
254     }
255     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
256   }
257 
newString(char... chars)258   private static String newString(char... chars) {
259     return new String(chars);
260   }
261 
toByteArray(int... bytes)262   private static byte[] toByteArray(int... bytes) {
263     byte[] realBytes = new byte[bytes.length];
264     for (int i = 0; i < bytes.length; i++) {
265       realBytes[i] = (byte) bytes[i];
266     }
267     return realBytes;
268   }
269 
assertWellFormed(int... bytes)270   private static void assertWellFormed(int... bytes) {
271     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
272   }
273 
assertNotWellFormed(int... bytes)274   private static void assertNotWellFormed(int... bytes) {
275     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
276   }
277 
generateFourByteShardsExpectedRunnables()278   private static long[] generateFourByteShardsExpectedRunnables() {
279     long[] expected = new long[128];
280     // 0-63 are all 5300224
281     for (int i = 0; i <= 63; i++) {
282       expected[i] = 5300224;
283     }
284     // 97-111 are all 2342912
285     for (int i = 97; i <= 111; i++) {
286       expected[i] = 2342912;
287     }
288     // 113-117 are all 1048576
289     for (int i = 113; i <= 117; i++) {
290       expected[i] = 1048576;
291     }
292     // One offs
293     expected[112] = 786432;
294     expected[118] = 786432;
295     expected[119] = 1048576;
296     expected[120] = 458752;
297     expected[121] = 524288;
298     expected[122] = 65536;
299     // Anything not assigned was the default 0.
300     return expected;
301   }
302 
303   /**
304    * Helper to run the loop to test all the permutations for the number of bytes specified.
305    *
306    * @param numBytes the number of bytes in the byte array
307    * @param expectedCount the expected number of roundtrippable permutations
308    */
309   @J2ktIncompatible
310   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount)311   private static void testBytes(int numBytes, long expectedCount) {
312     testBytes(numBytes, expectedCount, 0, -1);
313   }
314 
315   /**
316    * Helper to run the loop to test all the permutations for the number of bytes specified. This
317    * overload is useful for debugging to get the loop to start at a certain character.
318    *
319    * @param numBytes the number of bytes in the byte array
320    * @param expectedCount the expected number of roundtrippable permutations
321    * @param start the starting bytes encoded as a long as big-endian
322    * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
323    *     limit for numBytes
324    */
325   @J2ktIncompatible
326   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount, long start, long lim)327   private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
328     byte[] bytes = new byte[numBytes];
329     if (lim == -1) {
330       lim = 1L << (numBytes * 8);
331     }
332     long countRoundTripped = 0;
333     for (long byteChar = start; byteChar < lim; byteChar++) {
334       long tmpByteChar = byteChar;
335       for (int i = 0; i < numBytes; i++) {
336         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
337         tmpByteChar = tmpByteChar >> 8;
338       }
339       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
340       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
341       String s = new String(bytes, Charsets.UTF_8);
342       byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
343       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
344 
345       if (bytesEqual != isRoundTrippable) {
346         fail();
347       }
348       if (isRoundTrippable) {
349         countRoundTripped++;
350       }
351     }
352     assertEquals(expectedCount, countRoundTripped);
353   }
354 }
355