1 /* 2 * Copyright (C) 2013 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import static com.google.common.truth.Truth.assertThat; 20 import static java.lang.Character.MAX_CODE_POINT; 21 import static java.lang.Character.MAX_HIGH_SURROGATE; 22 import static java.lang.Character.MAX_LOW_SURROGATE; 23 import static java.lang.Character.MIN_HIGH_SURROGATE; 24 import static java.lang.Character.MIN_LOW_SURROGATE; 25 import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT; 26 27 import com.google.common.annotations.GwtCompatible; 28 import com.google.common.annotations.GwtIncompatible; 29 import com.google.common.collect.ImmutableList; 30 import java.util.Arrays; 31 import java.util.HashMap; 32 import java.util.Random; 33 import junit.framework.TestCase; 34 35 /** 36 * Unit tests for {@link Utf8}. 37 * 38 * @author Jon Perlow 39 * @author Martin Buchholz 40 * @author Clément Roux 41 */ 42 @GwtCompatible(emulated = true) 43 public class Utf8Test extends TestCase { 44 45 private static final ImmutableList<String> ILL_FORMED_STRINGS; 46 47 static { 48 ImmutableList.Builder<String> builder = ImmutableList.builder(); 49 char[] surrogates = { 50 MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE, 51 }; 52 for (char surrogate : surrogates) { newString(surrogate)53 builder.add(newString(surrogate)); newString(surrogate, 'n')54 builder.add(newString(surrogate, 'n')); 55 builder.add(newString('n', surrogate)); newString(surrogate, surrogate)56 builder.add(newString(surrogate, surrogate)); 57 } newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)58 builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE)); 59 ILL_FORMED_STRINGS = builder.build(); 60 } 61 testEncodedLength_validStrings()62 public void testEncodedLength_validStrings() { 63 assertEquals(0, Utf8.encodedLength("")); 64 assertEquals(11, Utf8.encodedLength("Hello world")); 65 assertEquals(8, Utf8.encodedLength("Résumé")); 66 assertEquals( 67 461, 68 Utf8.encodedLength( 69 "威廉·莎士比亞(William Shakespeare," 70 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人," 71 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、" 72 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、" 73 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響," 74 + "哈都拕人翻譯做好多話。")); 75 // A surrogate pair 76 assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE))); 77 } 78 testEncodedLength_validStrings2()79 public void testEncodedLength_validStrings2() { 80 HashMap<Integer, Integer> utf8Lengths = new HashMap<>(); 81 utf8Lengths.put(0x00, 1); 82 utf8Lengths.put(0x7f, 1); 83 utf8Lengths.put(0x80, 2); 84 utf8Lengths.put(0x7ff, 2); 85 utf8Lengths.put(0x800, 3); 86 utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3); 87 utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4); 88 utf8Lengths.put(MAX_CODE_POINT, 4); 89 90 Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {}); 91 StringBuilder sb = new StringBuilder(); 92 Random rnd = new Random(); 93 for (int trial = 0; trial < 100; trial++) { 94 sb.setLength(0); 95 int utf8Length = 0; 96 for (int i = 0; i < 6; i++) { 97 Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)]; 98 sb.appendCodePoint(randomCodePoint); 99 utf8Length += utf8Lengths.get(randomCodePoint); 100 if (utf8Length != Utf8.encodedLength(sb)) { 101 StringBuilder repro = new StringBuilder(); 102 for (int j = 0; j < sb.length(); j++) { 103 repro.append(" ").append((int) sb.charAt(j)); // GWT compatible 104 } 105 assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb)); 106 } 107 } 108 } 109 } 110 testEncodedLength_invalidStrings()111 public void testEncodedLength_invalidStrings() { 112 testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0); 113 testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6); 114 testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0); 115 testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6); 116 testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0); 117 } 118 testEncodedLengthFails(String invalidString, int invalidCodePointIndex)119 private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) { 120 try { 121 Utf8.encodedLength(invalidString); 122 fail(); 123 } catch (IllegalArgumentException expected) { 124 assertThat(expected) 125 .hasMessageThat() 126 .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex); 127 } 128 } 129 130 // 128 - [chars 0x0000 to 0x007f] 131 private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; 132 133 // 128 134 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = 135 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 136 137 // 1920 [chars 0x0080 to 0x07FF] 138 private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; 139 140 // 18,304 141 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 142 // Both bytes are one byte characters 143 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) 144 + 145 // The possible number of two byte characters 146 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 147 148 // 2048 149 private static final long THREE_BYTE_SURROGATES = 2 * 1024; 150 151 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 152 private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 153 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 154 155 // 2,650,112 156 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 157 // All one byte characters 158 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) 159 + 160 // One two byte character and a one byte character 161 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 162 + 163 // Three byte characters 164 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 165 166 // 1,048,576 [chars 0x10000L to 0x10FFFF] 167 private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; 168 169 // 289,571,839 170 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 171 // All one byte characters 172 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) 173 + 174 // One and three byte characters 175 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 176 + 177 // Two two byte characters 178 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 179 + 180 // Permutations of one and two byte characters 181 3 182 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 183 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 184 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 185 + 186 // Four byte characters 187 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 188 189 /** Tests that round tripping of all two byte permutations work. */ 190 @GwtIncompatible // java.nio.charset.Charset testIsWellFormed_1Byte()191 public void testIsWellFormed_1Byte() { 192 testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT); 193 } 194 195 /** Tests that round tripping of all two byte permutations work. */ 196 @GwtIncompatible // java.nio.charset.Charset testIsWellFormed_2Bytes()197 public void testIsWellFormed_2Bytes() { 198 testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT); 199 } 200 201 /** Tests that round tripping of all three byte permutations work. */ 202 @GwtIncompatible // java.nio.charset.Charset 203 testIsWellFormed_3Bytes()204 public void testIsWellFormed_3Bytes() { 205 testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT); 206 } 207 208 /** 209 * Tests that round tripping of a sample of four byte permutations work. All permutations are 210 * prohibitively expensive to test for automated runs. This method tests specific four-byte cases. 211 */ testIsWellFormed_4BytesSamples()212 public void testIsWellFormed_4BytesSamples() { 213 // Valid 4 byte. 214 assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2); 215 // Bad trailing bytes 216 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F); 217 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0); 218 // Special cases for byte2 219 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2); 220 assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2); 221 } 222 223 /** Tests some hard-coded test cases. */ testSomeSequences()224 public void testSomeSequences() { 225 // Empty 226 assertWellFormed(); 227 // One-byte characters, including control characters 228 assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f" 229 // Two-byte characters 230 assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2" 231 // Three-byte characters 232 assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac" 233 // Four-byte characters 234 // "\u024B62\u024B62" 235 assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32); 236 // Mixed string 237 // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62" 238 assertWellFormed( 239 0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32, 240 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32); 241 // Not a valid string 242 assertNotWellFormed(-1, 0, -1, 0); 243 } 244 testShardsHaveExpectedRoundTrippables()245 public void testShardsHaveExpectedRoundTrippables() { 246 // A sanity check. 247 long actual = 0; 248 for (long expected : generateFourByteShardsExpectedRunnables()) { 249 actual += expected; 250 } 251 assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual); 252 } 253 newString(char... chars)254 private static String newString(char... chars) { 255 return new String(chars); 256 } 257 toByteArray(int... bytes)258 private static byte[] toByteArray(int... bytes) { 259 byte[] realBytes = new byte[bytes.length]; 260 for (int i = 0; i < bytes.length; i++) { 261 realBytes[i] = (byte) bytes[i]; 262 } 263 return realBytes; 264 } 265 assertWellFormed(int... bytes)266 private static void assertWellFormed(int... bytes) { 267 assertTrue(Utf8.isWellFormed(toByteArray(bytes))); 268 } 269 assertNotWellFormed(int... bytes)270 private static void assertNotWellFormed(int... bytes) { 271 assertFalse(Utf8.isWellFormed(toByteArray(bytes))); 272 } 273 generateFourByteShardsExpectedRunnables()274 private static long[] generateFourByteShardsExpectedRunnables() { 275 long[] expected = new long[128]; 276 // 0-63 are all 5300224 277 for (int i = 0; i <= 63; i++) { 278 expected[i] = 5300224; 279 } 280 // 97-111 are all 2342912 281 for (int i = 97; i <= 111; i++) { 282 expected[i] = 2342912; 283 } 284 // 113-117 are all 1048576 285 for (int i = 113; i <= 117; i++) { 286 expected[i] = 1048576; 287 } 288 // One offs 289 expected[112] = 786432; 290 expected[118] = 786432; 291 expected[119] = 1048576; 292 expected[120] = 458752; 293 expected[121] = 524288; 294 expected[122] = 65536; 295 // Anything not assigned was the default 0. 296 return expected; 297 } 298 299 /** 300 * Helper to run the loop to test all the permutations for the number of bytes specified. 301 * 302 * @param numBytes the number of bytes in the byte array 303 * @param expectedCount the expected number of roundtrippable permutations 304 */ 305 @GwtIncompatible // java.nio.charset.Charset testBytes(int numBytes, long expectedCount)306 private static void testBytes(int numBytes, long expectedCount) { 307 testBytes(numBytes, expectedCount, 0, -1); 308 } 309 310 /** 311 * Helper to run the loop to test all the permutations for the number of bytes specified. This 312 * overload is useful for debugging to get the loop to start at a certain character. 313 * 314 * @param numBytes the number of bytes in the byte array 315 * @param expectedCount the expected number of roundtrippable permutations 316 * @param start the starting bytes encoded as a long as big-endian 317 * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max 318 * limit for numBytes 319 */ 320 @GwtIncompatible // java.nio.charset.Charset testBytes(int numBytes, long expectedCount, long start, long lim)321 private static void testBytes(int numBytes, long expectedCount, long start, long lim) { 322 byte[] bytes = new byte[numBytes]; 323 if (lim == -1) { 324 lim = 1L << (numBytes * 8); 325 } 326 long countRoundTripped = 0; 327 for (long byteChar = start; byteChar < lim; byteChar++) { 328 long tmpByteChar = byteChar; 329 for (int i = 0; i < numBytes; i++) { 330 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 331 tmpByteChar = tmpByteChar >> 8; 332 } 333 boolean isRoundTrippable = Utf8.isWellFormed(bytes); 334 assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes)); 335 String s = new String(bytes, Charsets.UTF_8); 336 byte[] bytesReencoded = s.getBytes(Charsets.UTF_8); 337 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); 338 339 if (bytesEqual != isRoundTrippable) { 340 fail(); 341 } 342 if (isRoundTrippable) { 343 countRoundTripped++; 344 } 345 } 346 assertEquals(expectedCount, countRoundTripped); 347 } 348 } 349