/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.truth.Truth.assertThat;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.MAX_HIGH_SURROGATE;
import static java.lang.Character.MAX_LOW_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.collect.ImmutableList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;
import junit.framework.TestCase;

/**
 * Unit tests for {@link Utf8}.
 *
 * @author Jon Perlow
 * @author Martin Buchholz
 * @author Clément Roux
 */
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {

  /**
   * Strings that contain at least one unpaired surrogate, in assorted positions.
   *
   * <p>NOTE(review): no test in this file currently reads this constant; confirm whether it is
   * still needed.
   */
  private static final ImmutableList<String> ILL_FORMED_STRINGS;

  static {
    ImmutableList.Builder<String> builder = ImmutableList.builder();
    char[] surrogates = {
      MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
    };
    for (char surrogate : surrogates) {
      builder.add(newString(surrogate));
      builder.add(newString(surrogate, 'n'));
      builder.add(newString('n', surrogate));
      builder.add(newString(surrogate, surrogate));
    }
    // A low surrogate followed by a high surrogate: both halves present, but in the wrong order.
    builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
    ILL_FORMED_STRINGS = builder.build();
  }

  /** Checks {@link Utf8#encodedLength} against hand-computed lengths of well-formed strings. */
  public void testEncodedLength_validStrings() {
    assertEquals(0, Utf8.encodedLength(""));
    assertEquals(11, Utf8.encodedLength("Hello world"));
    assertEquals(8, Utf8.encodedLength("Résumé"));
    assertEquals(
        461,
        Utf8.encodedLength(
            "威廉·莎士比亞(William Shakespeare,"
                + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
                + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
                + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
                + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
                + "哈都拕人翻譯做好多話。"));
    // A surrogate pair
    assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
  }

  /**
   * Checks {@link Utf8#encodedLength} on random concatenations of code points that sit on the
   * boundaries between the one-, two-, three- and four-byte encodings.
   */
  public void testEncodedLength_validStrings2() {
    // Boundary code point -> number of bytes in its UTF-8 encoding.
    HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
    utf8Lengths.put(0x00, 1);
    utf8Lengths.put(0x7f, 1);
    utf8Lengths.put(0x80, 2);
    utf8Lengths.put(0x7ff, 2);
    utf8Lengths.put(0x800, 3);
    utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
    utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
    utf8Lengths.put(MAX_CODE_POINT, 4);

    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[0]);
    StringBuilder sb = new StringBuilder();
    // The generator is not seeded, so a failure reports the exact chars needed to reproduce it.
    Random rnd = new Random();
    for (int trial = 0; trial < 100; trial++) {
      sb.setLength(0);
      int utf8Length = 0;
      for (int i = 0; i < 6; i++) {
        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
        sb.appendCodePoint(randomCodePoint);
        utf8Length += utf8Lengths.get(randomCodePoint);
        // The length is re-checked after every append, i.e. for every prefix of the string.
        if (utf8Length != Utf8.encodedLength(sb)) {
          StringBuilder repro = new StringBuilder();
          for (int j = 0; j < sb.length(); j++) {
            repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
          }
          assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
        }
      }
    }
  }

  /** Checks that unpaired surrogates are rejected and reported at the right char index. */
  public void testEncodedLength_invalidStrings() {
    testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
    testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
    testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
  }

  /**
   * Asserts that {@link Utf8#encodedLength} throws {@link IllegalArgumentException} for the given
   * string and that the message names the expected index.
   *
   * @param invalidString a string containing an unpaired surrogate
   * @param invalidCodePointIndex the char index expected in the exception message
   */
  private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
    try {
      Utf8.encodedLength(invalidString);
      fail("Expected IllegalArgumentException for unpaired surrogate at index "
          + invalidCodePointIndex);
    } catch (IllegalArgumentException expected) {
      assertThat(expected)
          .hasMessageThat()
          .isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
    }
  }

  // 128 - [chars 0x0000 to 0x007f]
  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;

  // 128
  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;

  // 18,304
  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
          +
          // The possible number of two byte characters
          TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048
  private static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
          +
          // One two byte character and a one byte character
          2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Three byte characters
          THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;

  // 383,270,912
  // (= 268,435,456 + 15,728,640 + 3,686,400 + 94,371,840 + 1,048,576, term by term below)
  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
          +
          // One and three byte characters
          2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Two two byte characters
          TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Permutations of one and two byte characters
          3
              * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
              * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
          +
          // Four byte characters
          FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /** Tests that round tripping of all one byte permutations works. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_1Byte() {
    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all two byte permutations works. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_2Bytes() {
    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all three byte permutations works. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_3Bytes() {
    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /**
   * Tests a sample of four byte sequences. All permutations are prohibitively expensive to test
   * for automated runs, so this method tests specific four-byte cases.
   */
  public void testIsWellFormed_4BytesSamples() {
    // Valid 4 byte.
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    // Bad trailing bytes
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    // Special cases for byte2
    // After lead byte 0xF0 the second byte must be >= 0x90; 0x8F would be an overlong encoding.
    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    // After lead byte 0xF4 the second byte must be <= 0x8F; 0x90 would exceed U+10FFFF.
    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
  }

  /** Tests some hard-coded test cases. */
  public void testSomeSequences() {
    // Empty
    assertWellFormed();
    // One-byte characters, including control characters
    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    // Two-byte characters
    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    // Three-byte groups: the two-byte character U+020A followed by ASCII 'c', twice
    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    // Four-byte groups: the two-byte character U+024B followed by ASCII "62", twice
    // "\u024B62\u024B62"
    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    // Mixed string
    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    assertWellFormed(
        0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
        0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    // Not a valid string (-1 is narrowed to the byte 0xFF, which never occurs in UTF-8)
    assertNotWellFormed(-1, 0, -1, 0);
  }

  /** Checks that the per-shard expectations add up to the computed four byte total. */
  public void testShardsHaveExpectedRoundTrippables() {
    // A sanity check.
    long actual = 0;
    for (long expected : generateFourByteShardsExpectedRunnables()) {
      actual += expected;
    }
    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
  }

  /** Builds a string from the given chars without any validation of surrogate pairing. */
  private static String newString(char... chars) {
    return new String(chars);
  }

  /** Narrows each int to a byte, so callers can write unsigned literals such as {@code 0xF0}. */
  private static byte[] toByteArray(int... bytes) {
    byte[] realBytes = new byte[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      realBytes[i] = (byte) bytes[i];
    }
    return realBytes;
  }

  /** Asserts that {@link Utf8#isWellFormed(byte[])} accepts the given byte sequence. */
  private static void assertWellFormed(int... bytes) {
    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /** Asserts that {@link Utf8#isWellFormed(byte[])} rejects the given byte sequence. */
  private static void assertNotWellFormed(int... bytes) {
    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /**
   * Returns the expected number of round-trippable four byte sequences for each of 128 shards.
   *
   * <p>NOTE(review): the shard index looks like the leading byte divided by two (e.g. 5300224 is
   * twice the three byte total, and 2342912 is 2 * 64 * the two byte total) -- confirm against
   * whatever consumes the shards.
   */
  private static long[] generateFourByteShardsExpectedRunnables() {
    long[] expected = new long[128];
    // 0-63 are all 5300224
    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }
    // 97-111 are all 2342912
    for (int i = 97; i <= 111; i++) {
      expected[i] = 2342912;
    }
    // 113-117 are all 1048576
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }
    // One offs
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;
    // Anything not assigned was the default 0.
    return expected;
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes specified.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  @GwtIncompatible // java.nio.charset.Charset
  private static void testBytes(int numBytes, long expectedCount) {
    testBytes(numBytes, expectedCount, 0, -1);
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes specified. This
   * overload is useful for debugging to get the loop to start at a certain character.
   *
   * <p>For every candidate sequence, the verdict of {@link Utf8#isWellFormed} must agree with
   * whether decoding and re-encoding the bytes through the JDK's UTF-8 charset reproduces them.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
   *     limit for numBytes
   */
  @GwtIncompatible // java.nio.charset.Charset
  private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
    byte[] bytes = new byte[numBytes];
    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Unpack the counter into the array, least significant byte last (big-endian).
      long tmpByteChar = byteChar;
      for (int i = 0; i < numBytes; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
      // The ranged overload must agree with the whole-array overload.
      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
      String s = new String(bytes, UTF_8);
      byte[] bytesReencoded = s.getBytes(UTF_8);
      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);

      if (bytesEqual != isRoundTrippable) {
        fail(
            "Utf8.isWellFormed returned "
                + isRoundTrippable
                + " but JDK round trip equality was "
                + bytesEqual
                + " for bytes "
                + Arrays.toString(bytes));
      }
      if (isRoundTrippable) {
        countRoundTripped++;
      }
    }
    assertEquals(expectedCount, countRoundTripped);
  }
}