1 /* 2 * Copyright (C) 2013 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import static com.google.common.truth.Truth.assertThat; 20 import static java.lang.Character.MAX_CODE_POINT; 21 import static java.lang.Character.MAX_HIGH_SURROGATE; 22 import static java.lang.Character.MAX_LOW_SURROGATE; 23 import static java.lang.Character.MIN_HIGH_SURROGATE; 24 import static java.lang.Character.MIN_LOW_SURROGATE; 25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 26 27 import com.google.common.annotations.GwtCompatible; 28 import com.google.common.annotations.GwtIncompatible; 29 import com.google.common.annotations.J2ktIncompatible; 30 import com.google.common.collect.ImmutableList; 31 import java.util.Arrays; 32 import java.util.HashMap; 33 import java.util.Random; 34 import junit.framework.TestCase; 35 36 /** 37 * Unit tests for {@link Utf8}. 38 * 39 * @author Jon Perlow 40 * @author Martin Buchholz 41 * @author Clément Roux 42 */ 43 @GwtCompatible(emulated = true) 44 public class Utf8Test extends TestCase { 45 46 private static final ImmutableList<String> ILL_FORMED_STRINGS; 47 48 static { 49 ImmutableList.Builder<String> builder = ImmutableList.builder(); 50 char[] surrogates = { 51 MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE, 52 }; 53 for (char surrogate : surrogates) { newString(surrogate)54 builder.add(newString(surrogate)); newString(surrogate, 'n')55 builder.add(newString(surrogate, 'n')); 56 builder.add(newString('n', surrogate)); newString(surrogate, surrogate)57 builder.add(newString(surrogate, surrogate)); 58 } newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)59 builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)); 60 ILL_FORMED_STRINGS = builder.build(); 61 } 62 testEncodedLength_validStrings()63 public void testEncodedLength_validStrings() { 64 assertEquals(0, Utf8.encodedLength("")); 65 assertEquals(11, Utf8.encodedLength("Hello world")); 66 assertEquals(8, Utf8.encodedLength("Résumé")); 67 assertEquals( 68 461, 69 Utf8.encodedLength( 70 "威廉·莎士比亞(William Shakespeare," 71 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人," 72 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、" 73 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、" 74 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響," 75 + "哈都拕人翻譯做好多話。")); 76 // A surrogate pair 77 assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE))); 78 } 79 testEncodedLength_validStrings2()80 public void testEncodedLength_validStrings2() { 81 HashMap<Integer, Integer> utf8Lengths = new HashMap<>(); 82 utf8Lengths.put(0x00, 1); 83 utf8Lengths.put(0x7f, 1); 84 utf8Lengths.put(0x80, 2); 85 utf8Lengths.put(0x7ff, 2); 86 utf8Lengths.put(0x800, 3); 87 utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3); 88 utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4); 89 utf8Lengths.put(MAX_CODE_POINT, 4); 90 91 Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {}); 92 StringBuilder sb = new StringBuilder(); 93 Random rnd = new Random(); 94 for (int trial = 0; trial < 100; trial++) { 95 sb.setLength(0); 96 int utf8Length = 0; 97 for (int i = 0; i < 6; i++) { 98 Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)]; 99 sb.appendCodePoint(randomCodePoint); 100 utf8Length += utf8Lengths.get(randomCodePoint); 101 if (utf8Length != Utf8.encodedLength(sb)) { 102 StringBuilder repro = new StringBuilder(); 103 for (int j = 0; j < sb.length(); j++) { 104 repro.append(" ").append((int) sb.charAt(j)); // GWT compatible 105 } 106 assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb)); 107 } 108 } 109 } 110 } 111 testEncodedLength_invalidStrings()112 public void testEncodedLength_invalidStrings() { 113 testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0); 114 testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6); 115 testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0); 116 testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6); 117 testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0); 118 } 119 testEncodedLengthFails(String invalidString, int invalidCodePointIndex)120 private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) { 121 try { 122 Utf8.encodedLength(invalidString); 123 fail(); 124 } catch (IllegalArgumentException expected) { 125 assertThat(expected) 126 .hasMessageThat() 127 .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex); 128 } 129 } 130 131 // 128 - [chars 0x0000 to 0x007f] 132 private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; 133 134 // 128 135 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = 136 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 137 138 // 1920 [chars 0x0080 to 0x07FF] 139 private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; 140 141 // 18,304 142 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 143 // Both bytes are one byte characters 144 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) 145 + 146 // The possible number of two byte characters 147 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 148 149 // 2048 150 private static final long THREE_BYTE_SURROGATES = 2 * 1024; 151 152 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 153 private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 154 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 155 156 // 2,650,112 157 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 158 // All one byte characters 159 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) 160 + 161 // One two byte character and a one byte character 162 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 163 + 164 // Three byte characters 165 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 166 167 // 1,048,576 [chars 0x10000L to 0x10FFFF] 168 private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; 169 170 // 289,571,839 171 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 172 // All one byte characters 173 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) 174 + 175 // One and three byte characters 176 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 177 + 178 // Two two byte characters 179 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 180 + 181 // Permutations of one and two byte characters 182 3 183 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 184 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 185 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 186 + 187 // Four byte characters 188 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 189 190 /** Tests that round tripping of all two byte permutations work. */ 191 @J2ktIncompatible 192 @GwtIncompatible // java.nio.charset.Charset testIsWellFormed_1Byte()193 public void testIsWellFormed_1Byte() { 194 testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT); 195 } 196 197 /** Tests that round tripping of all two byte permutations work. */ 198 @J2ktIncompatible 199 @GwtIncompatible // java.nio.charset.Charset testIsWellFormed_2Bytes()200 public void testIsWellFormed_2Bytes() { 201 testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT); 202 } 203 204 /** Tests that round tripping of all three byte permutations work. */ 205 @J2ktIncompatible 206 @GwtIncompatible // java.nio.charset.Charset 207 testIsWellFormed_3Bytes()208 public void testIsWellFormed_3Bytes() { 209 testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT); 210 } 211 212 /** 213 * Tests that round tripping of a sample of four byte permutations work. All permutations are 214 * prohibitively expensive to test for automated runs. This method tests specific four-byte cases. 215 */ testIsWellFormed_4BytesSamples()216 public void testIsWellFormed_4BytesSamples() { 217 // Valid 4 byte. 218 assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2); 219 // Bad trailing bytes 220 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F); 221 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0); 222 // Special cases for byte2 223 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2); 224 assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2); 225 } 226 227 /** Tests some hard-coded test cases. */ testSomeSequences()228 public void testSomeSequences() { 229 // Empty 230 assertWellFormed(); 231 // One-byte characters, including control characters 232 assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f" 233 // Two-byte characters 234 assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2" 235 // Three-byte characters 236 assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac" 237 // Four-byte characters 238 // "\u024B62\u024B62" 239 assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32); 240 // Mixed string 241 // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62" 242 assertWellFormed( 243 0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32, 244 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32); 245 // Not a valid string 246 assertNotWellFormed(-1, 0, -1, 0); 247 } 248 testShardsHaveExpectedRoundTrippables()249 public void testShardsHaveExpectedRoundTrippables() { 250 // A sanity check. 251 long actual = 0; 252 for (long expected : generateFourByteShardsExpectedRunnables()) { 253 actual += expected; 254 } 255 assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual); 256 } 257 newString(char... chars)258 private static String newString(char... chars) { 259 return new String(chars); 260 } 261 toByteArray(int... bytes)262 private static byte[] toByteArray(int... bytes) { 263 byte[] realBytes = new byte[bytes.length]; 264 for (int i = 0; i < bytes.length; i++) { 265 realBytes[i] = (byte) bytes[i]; 266 } 267 return realBytes; 268 } 269 assertWellFormed(int... bytes)270 private static void assertWellFormed(int... bytes) { 271 assertTrue(Utf8.isWellFormed(toByteArray(bytes))); 272 } 273 assertNotWellFormed(int... bytes)274 private static void assertNotWellFormed(int... bytes) { 275 assertFalse(Utf8.isWellFormed(toByteArray(bytes))); 276 } 277 generateFourByteShardsExpectedRunnables()278 private static long[] generateFourByteShardsExpectedRunnables() { 279 long[] expected = new long[128]; 280 // 0-63 are all 5300224 281 for (int i = 0; i <= 63; i++) { 282 expected[i] = 5300224; 283 } 284 // 97-111 are all 2342912 285 for (int i = 97; i <= 111; i++) { 286 expected[i] = 2342912; 287 } 288 // 113-117 are all 1048576 289 for (int i = 113; i <= 117; i++) { 290 expected[i] = 1048576; 291 } 292 // One offs 293 expected[112] = 786432; 294 expected[118] = 786432; 295 expected[119] = 1048576; 296 expected[120] = 458752; 297 expected[121] = 524288; 298 expected[122] = 65536; 299 // Anything not assigned was the default 0. 300 return expected; 301 } 302 303 /** 304 * Helper to run the loop to test all the permutations for the number of bytes specified. 305 * 306 * @param numBytes the number of bytes in the byte array 307 * @param expectedCount the expected number of roundtrippable permutations 308 */ 309 @J2ktIncompatible 310 @GwtIncompatible // java.nio.charset.Charset testBytes(int numBytes, long expectedCount)311 private static void testBytes(int numBytes, long expectedCount) { 312 testBytes(numBytes, expectedCount, 0, -1); 313 } 314 315 /** 316 * Helper to run the loop to test all the permutations for the number of bytes specified. This 317 * overload is useful for debugging to get the loop to start at a certain character. 318 * 319 * @param numBytes the number of bytes in the byte array 320 * @param expectedCount the expected number of roundtrippable permutations 321 * @param start the starting bytes encoded as a long as big-endian 322 * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max 323 * limit for numBytes 324 */ 325 @J2ktIncompatible 326 @GwtIncompatible // java.nio.charset.Charset testBytes(int numBytes, long expectedCount, long start, long lim)327 private static void testBytes(int numBytes, long expectedCount, long start, long lim) { 328 byte[] bytes = new byte[numBytes]; 329 if (lim == -1) { 330 lim = 1L << (numBytes * 8); 331 } 332 long countRoundTripped = 0; 333 for (long byteChar = start; byteChar < lim; byteChar++) { 334 long tmpByteChar = byteChar; 335 for (int i = 0; i < numBytes; i++) { 336 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 337 tmpByteChar = tmpByteChar >> 8; 338 } 339 boolean isRoundTrippable = Utf8.isWellFormed(bytes); 340 assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes)); 341 String s = new String(bytes, Charsets.UTF_8); 342 byte[] bytesReencoded = s.getBytes(Charsets.UTF_8); 343 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); 344 345 if (bytesEqual != isRoundTrippable) { 346 fail(); 347 } 348 if (isRoundTrippable) { 349 countRoundTripped++; 350 } 351 } 352 assertEquals(expectedCount, countRoundTripped); 353 } 354 } 355