/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;

import junit.framework.TestCase;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;

/**
 * Unit tests for {@link Utf8}.
 *
 * @author Jon Perlow
 * @author Martin Buchholz
 * @author Clément Roux
 */
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {

  /** Tests {@code Utf8.encodedLength} on hard-coded valid strings. */
  public void testEncodedLength_validStrings() {
    assertEquals(0, Utf8.encodedLength(""));
    assertEquals(11, Utf8.encodedLength("Hello world"));
    assertEquals(8, Utf8.encodedLength("Résumé"));
    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
        + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
        + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
        + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
        + "哈都拕人翻譯做好多話。"));
    // A surrogate pair
    assertEquals(4, Utf8.encodedLength(
        newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
  }

  /**
   * Tests {@code Utf8.encodedLength} on random strings built from code points
   * that sit on the boundaries between one-, two-, three- and four-byte
   * encodings.
   */
  @GwtIncompatible("StringBuilder.appendCodePoint()")
  public void testEncodedLength_validStrings2() {
    // Maps each boundary code point to its expected UTF-8 length in bytes.
    HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
    utf8Lengths.put(0x00, 1);
    utf8Lengths.put(0x7f, 1);
    utf8Lengths.put(0x80, 2);
    utf8Lengths.put(0x7ff, 2);
    utf8Lengths.put(0x800, 3);
    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
    utf8Lengths.put(Character.MAX_CODE_POINT, 4);

    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
    StringBuilder sb = new StringBuilder();
    Random rnd = new Random();
    for (int trial = 0; trial < 100; trial++) {
      sb.setLength(0);
      int utf8Length = 0;
      for (int i = 0; i < 6; i++) {
        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
        sb.appendCodePoint(randomCodePoint);
        utf8Length += utf8Lengths.get(randomCodePoint);
        // The random source is unseeded, so on a mismatch the failure message
        // lists the chars of the offending string to make it reproducible.
        if (utf8Length != Utf8.encodedLength(sb)) {
          StringBuilder repro = new StringBuilder();
          for (int j = 0; j < sb.length(); j++) {
            repro.append(" " + (int) sb.charAt(j)); // GWT compatible
          }
          assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
        }
      }
    }
  }

  /**
   * Tests that {@code Utf8.encodedLength} rejects strings containing unpaired
   * surrogates and reports the index of the offending char.
   */
  public void testEncodedLength_invalidStrings() {
    testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
    testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
    testEncodedLengthFails(
        newString(
            Character.MIN_HIGH_SURROGATE,
            Character.MIN_HIGH_SURROGATE), 0);
  }

  /**
   * Asserts that {@code Utf8.encodedLength} throws
   * {@link IllegalArgumentException} for the given string with the message
   * {@code "Unpaired surrogate at index N"}.
   *
   * @param invalidString the string expected to be rejected
   * @param invalidCodePointIndex the index expected in the exception message
   */
  private static void testEncodedLengthFails(String invalidString,
      int invalidCodePointIndex) {
    try {
      Utf8.encodedLength(invalidString);
      fail();
    } catch (IllegalArgumentException expected) {
      assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
          expected.getMessage());
    }
  }

  // 128 - [chars 0x0000 to 0x007f]
  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x007f - 0x0000 + 1;

  // 128
  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x07FF - 0x0080 + 1;

  // 18,304
  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
      // The possible number of two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048
  private static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
      // One two byte character and a one byte character
      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Three byte characters
      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x10FFFF - 0x10000L + 1;

  // 383,270,912
  // (= 268,435,456 + 15,728,640 + 3,686,400 + 94,371,840 + 1,048,576)
  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
      // One and three byte characters
      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Two two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Permutations of one and two byte characters
      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Four byte characters
      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /** Tests that round tripping of all one byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_1Byte() {
    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all two byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_2Bytes() {
    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all three byte permutations work. */
  @GwtIncompatible("java.nio.charset.Charset")
  public void testIsWellFormed_3Bytes() {
    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /**
   * Tests that round tripping of a sample of four byte permutations work.
   * All permutations are prohibitively expensive to test for automated runs.
   * This method tests specific four-byte cases.
   */
  public void testIsWellFormed_4BytesSamples() {
    // Valid 4 byte.
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    // Bad trailing bytes
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    // Special cases for byte2
    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
  }

  /** Tests some hard-coded test cases. */
  public void testSomeSequences() {
    // Empty
    assertWellFormed();
    // One-byte characters, including control characters
    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    // Two-byte characters
    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    // A two-byte character (U+020A) followed by a one-byte character, twice
    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    // A two-byte character (U+024B) followed by two one-byte characters, twice
    // "\u024B62\u024B62"
    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    // Three-byte characters: U+20AC, twice
    assertWellFormed(0xE2, 0x82, 0xAC, 0xE2, 0x82, 0xAC);
    // Four-byte characters: U+24B62, twice
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2, 0xF0, 0xA4, 0xAD, 0xA2);
    // Mixed string
    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    // Not a valid string
    assertNotWellFormed(-1, 0, -1, 0);
  }

  /**
   * Checks that the per-shard expected counts add up to
   * {@code EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT}.
   */
  public void testShardsHaveExpectedRoundTrippables() {
    // A sanity check.
    long actual = 0;
    for (long expected : generateFourByteShardsExpectedRunnables()) {
      actual += expected;
    }
    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
  }

  /** Returns a new string made of the given chars, in order. */
  private String newString(char... chars) {
    return new String(chars);
  }

  /**
   * Narrows each int to a byte, so callers can write unsigned literals such as
   * {@code 0xF0} without a cast.
   */
  private byte[] toByteArray(int... bytes) {
    byte[] realBytes = new byte[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      realBytes[i] = (byte) bytes[i];
    }
    return realBytes;
  }

  /** Asserts that {@code Utf8.isWellFormed} accepts the given byte sequence. */
  private void assertWellFormed(int... bytes) {
    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /** Asserts that {@code Utf8.isWellFormed} rejects the given byte sequence. */
  private void assertNotWellFormed(int... bytes) {
    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /**
   * Returns a 128-element table of expected round-trippable counts for the
   * four-byte case, one entry per shard. How the four-byte space is split
   * into shards is not defined in this file.
   */
  private static long[] generateFourByteShardsExpectedRunnables() {
    long[] expected = new long[128];
    // 0-63 are all 5300224
    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }
    // 97-111 are all 2342912
    for (int i = 97; i <= 111; i++) {
      expected[i] = 2342912;
    }
    // 113-117 are all 1048576
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }
    // One offs
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;
    // Anything not assigned was the default 0.
    return expected;
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  @GwtIncompatible("java.nio.charset.Charset")
  private static void testBytes(int numBytes, long expectedCount) {
    testBytes(numBytes, expectedCount, 0, -1);
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified. This overload is useful for debugging to get the loop to start
   * at a certain character.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian,
   *     or -1 to mean the max limit for numBytes
   */
  @GwtIncompatible("java.nio.charset.Charset")
  private static void testBytes(int numBytes, long expectedCount, long start,
      long lim) {
    byte[] bytes = new byte[numBytes];
    if (lim == -1) {
      // Every value representable in numBytes bytes.
      lim = 1L << (numBytes * 8);
    }
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Unpack the counter into the array, least significant byte last.
      long tmpByteChar = byteChar;
      for (int i = 0; i < numBytes; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
      // Both overloads must agree.
      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
      // Cross-check against the JDK: decoding then re-encoding yields the
      // same bytes only when the input was well-formed UTF-8.
      String s = new String(bytes, Charsets.UTF_8);
      byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);

      // The message is built only on failure; this loop runs up to 2^24 times.
      if (bytesEqual != isRoundTrippable) {
        fail("Utf8.isWellFormed returned " + isRoundTrippable
            + " but the JDK round trip "
            + (bytesEqual ? "preserved" : "changed")
            + " the bytes " + Arrays.toString(bytes));
      }
      if (isRoundTrippable) {
        countRoundTripped++;
      }
    }
    assertEquals(expectedCount, countRoundTripped);
  }
}