• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import static com.google.common.truth.Truth.assertThat;
20 import static java.lang.Character.MAX_CODE_POINT;
21 import static java.lang.Character.MAX_HIGH_SURROGATE;
22 import static java.lang.Character.MAX_LOW_SURROGATE;
23 import static java.lang.Character.MIN_HIGH_SURROGATE;
24 import static java.lang.Character.MIN_LOW_SURROGATE;
25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
26 
27 import com.google.common.annotations.GwtCompatible;
28 import com.google.common.annotations.GwtIncompatible;
29 import com.google.common.collect.ImmutableList;
30 import java.util.Arrays;
31 import java.util.HashMap;
32 import java.util.Random;
33 import junit.framework.TestCase;
34 
35 /**
36  * Unit tests for {@link Utf8}.
37  *
38  * @author Jon Perlow
39  * @author Martin Buchholz
40  * @author Clément Roux
41  */
42 @GwtCompatible(emulated = true)
43 public class Utf8Test extends TestCase {
44 
45   private static final ImmutableList<String> ILL_FORMED_STRINGS;
46 
47   static {
48     ImmutableList.Builder<String> builder = ImmutableList.builder();
49     char[] surrogates = {
50       MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
51     };
52     for (char surrogate : surrogates) {
newString(surrogate)53       builder.add(newString(surrogate));
newString(surrogate, 'n')54       builder.add(newString(surrogate, 'n'));
55       builder.add(newString('n', surrogate));
newString(surrogate, surrogate)56       builder.add(newString(surrogate, surrogate));
57     }
newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)58     builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
59     ILL_FORMED_STRINGS = builder.build();
60   }
61 
testEncodedLength_validStrings()62   public void testEncodedLength_validStrings() {
63     assertEquals(0, Utf8.encodedLength(""));
64     assertEquals(11, Utf8.encodedLength("Hello world"));
65     assertEquals(8, Utf8.encodedLength("Résumé"));
66     assertEquals(
67         461,
68         Utf8.encodedLength(
69             "威廉·莎士比亞(William Shakespeare,"
70                 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
71                 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
72                 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
73                 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
74                 + "哈都拕人翻譯做好多話。"));
75     // A surrogate pair
76     assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
77   }
78 
testEncodedLength_validStrings2()79   public void testEncodedLength_validStrings2() {
80     HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
81     utf8Lengths.put(0x00, 1);
82     utf8Lengths.put(0x7f, 1);
83     utf8Lengths.put(0x80, 2);
84     utf8Lengths.put(0x7ff, 2);
85     utf8Lengths.put(0x800, 3);
86     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
87     utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
88     utf8Lengths.put(MAX_CODE_POINT, 4);
89 
90     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {});
91     StringBuilder sb = new StringBuilder();
92     Random rnd = new Random();
93     for (int trial = 0; trial < 100; trial++) {
94       sb.setLength(0);
95       int utf8Length = 0;
96       for (int i = 0; i < 6; i++) {
97         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
98         sb.appendCodePoint(randomCodePoint);
99         utf8Length += utf8Lengths.get(randomCodePoint);
100         if (utf8Length != Utf8.encodedLength(sb)) {
101           StringBuilder repro = new StringBuilder();
102           for (int j = 0; j < sb.length(); j++) {
103             repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
104           }
105           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
106         }
107       }
108     }
109   }
110 
testEncodedLength_invalidStrings()111   public void testEncodedLength_invalidStrings() {
112     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
113     testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
114     testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
115     testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
116     testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
117   }
118 
testEncodedLengthFails(String invalidString, int invalidCodePointIndex)119   private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
120     try {
121       Utf8.encodedLength(invalidString);
122       fail();
123     } catch (IllegalArgumentException expected) {
124       assertThat(expected)
125           .hasMessageThat()
126           .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
127     }
128   }
129 
130   // 128 - [chars 0x0000 to 0x007f]
131   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
132 
133   // 128
134   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
135       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
136 
137   // 1920 [chars 0x0080 to 0x07FF]
138   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
139 
140   // 18,304
141   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
142       // Both bytes are one byte characters
143       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
144           +
145           // The possible number of two byte characters
146           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
147 
148   // 2048
149   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
150 
151   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
152   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
153       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
154 
155   // 2,650,112
156   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
157       // All one byte characters
158       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
159           +
160           // One two byte character and a one byte character
161           2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
162           +
163           // Three byte characters
164           THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
165 
166   // 1,048,576 [chars 0x10000L to 0x10FFFF]
167   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
168 
  // 383,270,912
170   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
171       // All one byte characters
172       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
173           +
174           // One and three byte characters
175           2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
176           +
177           // Two two byte characters
178           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
179           +
180           // Permutations of one and two byte characters
181           3
182               * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
183               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
184               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
185           +
186           // Four byte characters
187           FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
188 
189   /** Tests that round tripping of all two byte permutations work. */
190   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_1Byte()191   public void testIsWellFormed_1Byte() {
192     testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
193   }
194 
195   /** Tests that round tripping of all two byte permutations work. */
196   @GwtIncompatible // java.nio.charset.Charset
testIsWellFormed_2Bytes()197   public void testIsWellFormed_2Bytes() {
198     testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
199   }
200 
201   /** Tests that round tripping of all three byte permutations work. */
202   @GwtIncompatible // java.nio.charset.Charset
203 
testIsWellFormed_3Bytes()204   public void testIsWellFormed_3Bytes() {
205     testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
206   }
207 
208   /**
209    * Tests that round tripping of a sample of four byte permutations work. All permutations are
210    * prohibitively expensive to test for automated runs. This method tests specific four-byte cases.
211    */
testIsWellFormed_4BytesSamples()212   public void testIsWellFormed_4BytesSamples() {
213     // Valid 4 byte.
214     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
215     // Bad trailing bytes
216     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
217     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
218     // Special cases for byte2
219     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
220     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
221   }
222 
223   /** Tests some hard-coded test cases. */
testSomeSequences()224   public void testSomeSequences() {
225     // Empty
226     assertWellFormed();
227     // One-byte characters, including control characters
228     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
229     // Two-byte characters
230     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
231     // Three-byte characters
232     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
233     // Four-byte characters
234     // "\u024B62\u024B62"
235     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
236     // Mixed string
237     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
238     assertWellFormed(
239         0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
240         0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
241     // Not a valid string
242     assertNotWellFormed(-1, 0, -1, 0);
243   }
244 
testShardsHaveExpectedRoundTrippables()245   public void testShardsHaveExpectedRoundTrippables() {
246     // A sanity check.
247     long actual = 0;
248     for (long expected : generateFourByteShardsExpectedRunnables()) {
249       actual += expected;
250     }
251     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
252   }
253 
newString(char... chars)254   private static String newString(char... chars) {
255     return new String(chars);
256   }
257 
toByteArray(int... bytes)258   private static byte[] toByteArray(int... bytes) {
259     byte[] realBytes = new byte[bytes.length];
260     for (int i = 0; i < bytes.length; i++) {
261       realBytes[i] = (byte) bytes[i];
262     }
263     return realBytes;
264   }
265 
assertWellFormed(int... bytes)266   private static void assertWellFormed(int... bytes) {
267     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
268   }
269 
assertNotWellFormed(int... bytes)270   private static void assertNotWellFormed(int... bytes) {
271     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
272   }
273 
generateFourByteShardsExpectedRunnables()274   private static long[] generateFourByteShardsExpectedRunnables() {
275     long[] expected = new long[128];
276     // 0-63 are all 5300224
277     for (int i = 0; i <= 63; i++) {
278       expected[i] = 5300224;
279     }
280     // 97-111 are all 2342912
281     for (int i = 97; i <= 111; i++) {
282       expected[i] = 2342912;
283     }
284     // 113-117 are all 1048576
285     for (int i = 113; i <= 117; i++) {
286       expected[i] = 1048576;
287     }
288     // One offs
289     expected[112] = 786432;
290     expected[118] = 786432;
291     expected[119] = 1048576;
292     expected[120] = 458752;
293     expected[121] = 524288;
294     expected[122] = 65536;
295     // Anything not assigned was the default 0.
296     return expected;
297   }
298 
299   /**
300    * Helper to run the loop to test all the permutations for the number of bytes specified.
301    *
302    * @param numBytes the number of bytes in the byte array
303    * @param expectedCount the expected number of roundtrippable permutations
304    */
305   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount)306   private static void testBytes(int numBytes, long expectedCount) {
307     testBytes(numBytes, expectedCount, 0, -1);
308   }
309 
310   /**
311    * Helper to run the loop to test all the permutations for the number of bytes specified. This
312    * overload is useful for debugging to get the loop to start at a certain character.
313    *
314    * @param numBytes the number of bytes in the byte array
315    * @param expectedCount the expected number of roundtrippable permutations
316    * @param start the starting bytes encoded as a long as big-endian
317    * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
318    *     limit for numBytes
319    */
320   @GwtIncompatible // java.nio.charset.Charset
testBytes(int numBytes, long expectedCount, long start, long lim)321   private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
322     byte[] bytes = new byte[numBytes];
323     if (lim == -1) {
324       lim = 1L << (numBytes * 8);
325     }
326     long countRoundTripped = 0;
327     for (long byteChar = start; byteChar < lim; byteChar++) {
328       long tmpByteChar = byteChar;
329       for (int i = 0; i < numBytes; i++) {
330         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
331         tmpByteChar = tmpByteChar >> 8;
332       }
333       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
334       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
335       String s = new String(bytes, Charsets.UTF_8);
336       byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
337       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
338 
339       if (bytesEqual != isRoundTrippable) {
340         fail();
341       }
342       if (isRoundTrippable) {
343         countRoundTripped++;
344       }
345     }
346     assertEquals(expectedCount, countRoundTripped);
347   }
348 }
349