1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.protobuf; 32 33 import static org.junit.Assert.assertEquals; 34 import static org.junit.Assert.assertFalse; 35 import static org.junit.Assert.assertSame; 36 import static org.junit.Assert.assertTrue; 37 import static org.junit.Assert.fail; 38 39 import java.lang.ref.SoftReference; 40 import java.nio.ByteBuffer; 41 import java.nio.CharBuffer; 42 import java.nio.charset.CharsetDecoder; 43 import java.nio.charset.CharsetEncoder; 44 import java.nio.charset.CoderResult; 45 import java.nio.charset.CodingErrorAction; 46 import java.util.ArrayList; 47 import java.util.Arrays; 48 import java.util.List; 49 import java.util.Random; 50 import java.util.logging.Logger; 51 52 /** 53 * Shared testing code for {@link IsValidUtf8Test} and 54 * {@link IsValidUtf8FourByteTest}. 55 * 56 * @author jonp@google.com (Jon Perlow) 57 * @author martinrb@google.com (Martin Buchholz) 58 */ 59 final class IsValidUtf8TestUtil { 60 private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName()); 61 IsValidUtf8TestUtil()62 private IsValidUtf8TestUtil() {} 63 64 static interface ByteStringFactory { newByteString(byte[] bytes)65 ByteString newByteString(byte[] bytes); 66 } 67 68 static final ByteStringFactory LITERAL_FACTORY = new ByteStringFactory() { 69 @Override 70 public ByteString newByteString(byte[] bytes) { 71 return ByteString.wrap(bytes); 72 } 73 }; 74 75 static final ByteStringFactory HEAP_NIO_FACTORY = new ByteStringFactory() { 76 @Override 77 public ByteString newByteString(byte[] bytes) { 78 return new NioByteString(ByteBuffer.wrap(bytes)); 79 } 80 }; 81 82 private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer = 83 new ThreadLocal<SoftReference<ByteBuffer>>(); 84 85 /** 86 * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this 87 * uses a thread local direct buffer. This means that each call will overwrite the buffer's 88 * contents from the previous call, so the calling code must be careful not to continue using 89 * a buffer returned from a previous invocation. 90 */ 91 static final ByteStringFactory DIRECT_NIO_FACTORY = new ByteStringFactory() { 92 @Override 93 public ByteString newByteString(byte[] bytes) { 94 SoftReference<ByteBuffer> ref = directBuffer.get(); 95 ByteBuffer buffer = ref == null ? null : ref.get(); 96 if (buffer == null || buffer.capacity() < bytes.length) { 97 buffer = ByteBuffer.allocateDirect(bytes.length); 98 directBuffer.set(new SoftReference<ByteBuffer>(buffer)); 99 } 100 buffer.clear(); 101 buffer.put(bytes); 102 buffer.flip(); 103 return new NioByteString(buffer); 104 } 105 }; 106 107 // 128 - [chars 0x0000 to 0x007f] 108 static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; 109 110 // 128 111 static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 112 113 // 1920 [chars 0x0080 to 0x07FF] 114 static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; 115 116 // 18,304 117 static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 118 // Both bytes are one byte characters 119 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) + 120 // The possible number of two byte characters 121 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 122 123 // 2048 124 static final long THREE_BYTE_SURROGATES = 2 * 1024; 125 126 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 127 static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 128 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 129 130 // 2,650,112 131 static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 132 // All one byte characters 133 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) + 134 // One two byte character and a one byte character 135 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 136 // Three byte characters 137 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 138 139 // 1,048,576 [chars 0x10000L to 0x10FFFF] 140 static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; 141 142 // 289,571,839 143 static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 144 // All one byte characters 145 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) + 146 // One and three byte characters 147 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 148 // Two two byte characters 149 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS + 150 // Permutations of one and two byte characters 151 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 152 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 153 + 154 // Four byte characters 155 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 156 157 static final class Shard { 158 final long index; 159 final long start; 160 final long lim; 161 final long expected; 162 163 Shard(long index, long start, long lim, long expected)164 public Shard(long index, long start, long lim, long expected) { 165 assertTrue(start < lim); 166 this.index = index; 167 this.start = start; 168 this.lim = lim; 169 this.expected = expected; 170 } 171 } 172 173 static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES = 174 generateFourByteShardsExpectedRunnables(); 175 176 private static long[] generateFourByteShardsExpectedRunnables() { 177 long[] expected = new long[128]; 178 179 // 0-63 are all 5300224 180 for (int i = 0; i <= 63; i++) { 181 expected[i] = 5300224; 182 } 183 184 // 97-111 are all 2342912 185 for (int i = 97; i <= 111; i++) { 186 expected[i] = 2342912; 187 } 188 189 // 113-117 are all 1048576 190 for (int i = 113; i <= 117; i++) { 191 expected[i] = 1048576; 192 } 193 194 // One offs 195 expected[112] = 786432; 196 expected[118] = 786432; 197 expected[119] = 1048576; 198 expected[120] = 458752; 199 expected[121] = 524288; 200 expected[122] = 65536; 201 202 // Anything not assigned was the default 0. 203 return expected; 204 } 205 206 static final List<Shard> FOUR_BYTE_SHARDS = 207 generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES); 208 209 210 private static List<Shard> generateFourByteShards(int numShards, long[] expected) { 211 assertEquals(numShards, expected.length); 212 List<Shard> shards = new ArrayList<Shard>(numShards); 213 long LIM = 1L << 32; 214 long increment = LIM / numShards; 215 assertTrue(LIM % numShards == 0); 216 for (int i = 0; i < numShards; i++) { 217 shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i])); 218 } 219 return shards; 220 } 221 222 /** 223 * Helper to run the loop to test all the permutations for the number of bytes 224 * specified. 225 * 226 * @param factory the factory for {@link ByteString} instances. 227 * @param numBytes the number of bytes in the byte array 228 * @param expectedCount the expected number of roundtrippable permutations 229 */ 230 static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) { 231 testBytes(factory, numBytes, expectedCount, 0, -1); 232 } 233 234 /** 235 * Helper to run the loop to test all the permutations for the number of bytes 236 * specified. This overload is useful for debugging to get the loop to start 237 * at a certain character. 238 * 239 * @param factory the factory for {@link ByteString} instances. 240 * @param numBytes the number of bytes in the byte array 241 * @param expectedCount the expected number of roundtrippable permutations 242 * @param start the starting bytes encoded as a long as big-endian 243 * @param lim the limit of bytes to process encoded as a long as big-endian, 244 * or -1 to mean the max limit for numBytes 245 */ 246 static void testBytes( 247 ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { 248 Random rnd = new Random(); 249 byte[] bytes = new byte[numBytes]; 250 251 if (lim == -1) { 252 lim = 1L << (numBytes * 8); 253 } 254 long count = 0; 255 long countRoundTripped = 0; 256 for (long byteChar = start; byteChar < lim; byteChar++) { 257 long tmpByteChar = byteChar; 258 for (int i = 0; i < numBytes; i++) { 259 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 260 tmpByteChar = tmpByteChar >> 8; 261 } 262 ByteString bs = factory.newByteString(bytes); 263 boolean isRoundTrippable = bs.isValidUtf8(); 264 String s = new String(bytes, Internal.UTF_8); 265 byte[] bytesReencoded = s.getBytes(Internal.UTF_8); 266 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); 267 268 if (bytesEqual != isRoundTrippable) { 269 outputFailure(byteChar, bytes, bytesReencoded); 270 } 271 272 // Check agreement with static Utf8 methods. 273 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes)); 274 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes)); 275 276 // Test partial sequences. 277 // Partition numBytes into three segments (not necessarily non-empty). 278 int i = rnd.nextInt(numBytes); 279 int j = rnd.nextInt(numBytes); 280 if (j < i) { 281 int tmp = i; 282 i = j; 283 j = tmp; 284 } 285 int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i); 286 int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j); 287 int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes); 288 if (isRoundTrippable != (state3 == Utf8.COMPLETE)) { 289 System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j); 290 outputFailure(byteChar, bytes, bytesReencoded); 291 } 292 assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE)); 293 294 // Test ropes built out of small partial sequences 295 ByteString rope = RopeByteString.newInstanceForTest( 296 bs.substring(0, i), 297 RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes))); 298 assertSame(RopeByteString.class, rope.getClass()); 299 300 ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope}; 301 for (ByteString x : byteStrings) { 302 assertEquals(isRoundTrippable, x.isValidUtf8()); 303 assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes)); 304 305 assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i)); 306 assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i)); 307 assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i)); 308 assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i)); 309 assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j)); 310 assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j)); 311 } 312 313 // ByteString reduplication should not affect its UTF-8 validity. 314 ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes)); 315 assertEquals(isRoundTrippable, ropeADope.isValidUtf8()); 316 317 if (isRoundTrippable) { 318 countRoundTripped++; 319 } 320 count++; 321 if (byteChar != 0 && byteChar % 1000000L == 0) { 322 logger.info("Processed " + (byteChar / 1000000L) + " million characters"); 323 } 324 } 325 logger.info("Round tripped " + countRoundTripped + " of " + count); 326 assertEquals(expectedCount, countRoundTripped); 327 } 328 329 /** 330 * Variation of {@link #testBytes} that does less allocation using the 331 * low-level encoders/decoders directly. Checked in because it's useful for 332 * debugging when trying to process bytes faster, but since it doesn't use the 333 * actual String class, it's possible for incompatibilities to develop 334 * (although unlikely). 335 * 336 * @param factory the factory for {@link ByteString} instances. 337 * @param numBytes the number of bytes in the byte array 338 * @param expectedCount the expected number of roundtrippable permutations 339 * @param start the starting bytes encoded as a long as big-endian 340 * @param lim the limit of bytes to process encoded as a long as big-endian, 341 * or -1 to mean the max limit for numBytes 342 */ 343 static void testBytesUsingByteBuffers( 344 ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { 345 CharsetDecoder decoder = 346 Internal.UTF_8.newDecoder() 347 .onMalformedInput(CodingErrorAction.REPLACE) 348 .onUnmappableCharacter(CodingErrorAction.REPLACE); 349 CharsetEncoder encoder = 350 Internal.UTF_8.newEncoder() 351 .onMalformedInput(CodingErrorAction.REPLACE) 352 .onUnmappableCharacter(CodingErrorAction.REPLACE); 353 byte[] bytes = new byte[numBytes]; 354 int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1; 355 char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1]; 356 int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1; 357 byte[] bytesReencoded = new byte[maxBytes]; 358 359 ByteBuffer bb = ByteBuffer.wrap(bytes); 360 CharBuffer cb = CharBuffer.wrap(charsDecoded); 361 ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded); 362 if (lim == -1) { 363 lim = 1L << (numBytes * 8); 364 } 365 long count = 0; 366 long countRoundTripped = 0; 367 for (long byteChar = start; byteChar < lim; byteChar++) { 368 bb.rewind(); 369 bb.limit(bytes.length); 370 cb.rewind(); 371 cb.limit(charsDecoded.length); 372 bbReencoded.rewind(); 373 bbReencoded.limit(bytesReencoded.length); 374 encoder.reset(); 375 decoder.reset(); 376 long tmpByteChar = byteChar; 377 for (int i = 0; i < bytes.length; i++) { 378 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 379 tmpByteChar = tmpByteChar >> 8; 380 } 381 boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8(); 382 CoderResult result = decoder.decode(bb, cb, true); 383 assertFalse(result.isError()); 384 result = decoder.flush(cb); 385 assertFalse(result.isError()); 386 387 int charLen = cb.position(); 388 cb.rewind(); 389 cb.limit(charLen); 390 result = encoder.encode(cb, bbReencoded, true); 391 assertFalse(result.isError()); 392 result = encoder.flush(bbReencoded); 393 assertFalse(result.isError()); 394 395 boolean bytesEqual = true; 396 int bytesLen = bbReencoded.position(); 397 if (bytesLen != numBytes) { 398 bytesEqual = false; 399 } else { 400 for (int i = 0; i < numBytes; i++) { 401 if (bytes[i] != bytesReencoded[i]) { 402 bytesEqual = false; 403 break; 404 } 405 } 406 } 407 if (bytesEqual != isRoundTrippable) { 408 outputFailure(byteChar, bytes, bytesReencoded, bytesLen); 409 } 410 411 count++; 412 if (isRoundTrippable) { 413 countRoundTripped++; 414 } 415 if (byteChar != 0 && byteChar % 1000000 == 0) { 416 logger.info("Processed " + (byteChar / 1000000) + " million characters"); 417 } 418 } 419 logger.info("Round tripped " + countRoundTripped + " of " + count); 420 assertEquals(expectedCount, countRoundTripped); 421 } 422 423 private static void outputFailure(long byteChar, byte[] bytes, byte[] after) { 424 outputFailure(byteChar, bytes, after, after.length); 425 } 426 427 private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) { 428 fail("Failure: (" + Long.toHexString(byteChar) + ") " + toHexString(bytes) + " => " 429 + toHexString(after, len)); 430 } 431 432 private static String toHexString(byte[] b) { 433 return toHexString(b, b.length); 434 } 435 436 private static String toHexString(byte[] b, int len) { 437 StringBuilder s = new StringBuilder(); 438 s.append("\""); 439 for (int i = 0; i < len; i++) { 440 if (i > 0) { 441 s.append(" "); 442 } 443 s.append(String.format("%02x", b[i] & 0xFF)); 444 } 445 s.append("\""); 446 return s.toString(); 447 } 448 } 449