1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // https://developers.google.com/protocol-buffers/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 package com.google.protobuf; 32 33 import static org.junit.Assert.assertEquals; 34 import static org.junit.Assert.assertFalse; 35 import static org.junit.Assert.assertSame; 36 import static org.junit.Assert.assertTrue; 37 import static org.junit.Assert.fail; 38 39 import java.lang.ref.SoftReference; 40 import java.nio.ByteBuffer; 41 import java.nio.CharBuffer; 42 import java.nio.charset.CharsetDecoder; 43 import java.nio.charset.CharsetEncoder; 44 import java.nio.charset.CoderResult; 45 import java.nio.charset.CodingErrorAction; 46 import java.util.ArrayList; 47 import java.util.Arrays; 48 import java.util.List; 49 import java.util.Random; 50 import java.util.logging.Logger; 51 52 /** 53 * Shared testing code for {@link IsValidUtf8Test} and {@link IsValidUtf8FourByteTest}. 54 * 55 * @author jonp@google.com (Jon Perlow) 56 * @author martinrb@google.com (Martin Buchholz) 57 */ 58 final class IsValidUtf8TestUtil { 59 private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName()); 60 IsValidUtf8TestUtil()61 private IsValidUtf8TestUtil() {} 62 63 static interface ByteStringFactory { newByteString(byte[] bytes)64 ByteString newByteString(byte[] bytes); 65 } 66 67 static final ByteStringFactory LITERAL_FACTORY = 68 new ByteStringFactory() { 69 @Override 70 public ByteString newByteString(byte[] bytes) { 71 return ByteString.wrap(bytes); 72 } 73 }; 74 75 static final ByteStringFactory HEAP_NIO_FACTORY = 76 new ByteStringFactory() { 77 @Override 78 public ByteString newByteString(byte[] bytes) { 79 return new NioByteString(ByteBuffer.wrap(bytes)); 80 } 81 }; 82 83 private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer = 84 new ThreadLocal<SoftReference<ByteBuffer>>(); 85 86 /** 87 * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this uses a 88 * thread local direct buffer. This means that each call will overwrite the buffer's contents from 89 * the previous call, so the calling code must be careful not to continue using a buffer returned 90 * from a previous invocation. 91 */ 92 static final ByteStringFactory DIRECT_NIO_FACTORY = 93 new ByteStringFactory() { 94 @Override 95 public ByteString newByteString(byte[] bytes) { 96 SoftReference<ByteBuffer> ref = directBuffer.get(); 97 ByteBuffer buffer = ref == null ? null : ref.get(); 98 if (buffer == null || buffer.capacity() < bytes.length) { 99 buffer = ByteBuffer.allocateDirect(bytes.length); 100 directBuffer.set(new SoftReference<ByteBuffer>(buffer)); 101 } 102 buffer.clear(); 103 buffer.put(bytes); 104 buffer.flip(); 105 return new NioByteString(buffer); 106 } 107 }; 108 109 // 128 - [chars 0x0000 to 0x007f] 110 static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; 111 112 // 128 113 static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 114 115 // 1920 [chars 0x0080 to 0x07FF] 116 static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; 117 118 // 18,304 119 static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 120 // Both bytes are one byte characters 121 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) 122 + 123 // The possible number of two byte characters 124 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 125 126 // 2048 127 static final long THREE_BYTE_SURROGATES = 2 * 1024; 128 129 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 130 static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 131 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 132 133 // 2,650,112 134 static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 135 // All one byte characters 136 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) 137 + 138 // One two byte character and a one byte character 139 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 140 + 141 // Three byte characters 142 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 143 144 // 1,048,576 [chars 0x10000L to 0x10FFFF] 145 static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; 146 147 // 289,571,839 148 static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 149 // All one byte characters 150 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) 151 + 152 // One and three byte characters 153 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 154 + 155 // Two two byte characters 156 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 157 + 158 // Permutations of one and two byte characters 159 3 160 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS 161 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 162 * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS 163 + 164 // Four byte characters 165 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 166 167 static final class Shard { 168 final long index; 169 final long start; 170 final long lim; 171 final long expected; 172 Shard(long index, long start, long lim, long expected)173 public Shard(long index, long start, long lim, long expected) { 174 assertTrue(start < lim); 175 this.index = index; 176 this.start = start; 177 this.lim = lim; 178 this.expected = expected; 179 } 180 } 181 182 static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES = 183 generateFourByteShardsExpectedRunnables(); 184 185 private static long[] generateFourByteShardsExpectedRunnables() { 186 long[] expected = new long[128]; 187 188 // 0-63 are all 5300224 189 for (int i = 0; i <= 63; i++) { 190 expected[i] = 5300224; 191 } 192 193 // 97-111 are all 2342912 194 for (int i = 97; i <= 111; i++) { 195 expected[i] = 2342912; 196 } 197 198 // 113-117 are all 1048576 199 for (int i = 113; i <= 117; i++) { 200 expected[i] = 1048576; 201 } 202 203 // One offs 204 expected[112] = 786432; 205 expected[118] = 786432; 206 expected[119] = 1048576; 207 expected[120] = 458752; 208 expected[121] = 524288; 209 expected[122] = 65536; 210 211 // Anything not assigned was the default 0. 212 return expected; 213 } 214 215 static final List<Shard> FOUR_BYTE_SHARDS = 216 generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES); 217 218 private static List<Shard> generateFourByteShards(int numShards, long[] expected) { 219 assertEquals(numShards, expected.length); 220 List<Shard> shards = new ArrayList<Shard>(numShards); 221 long lim = 1L << 32; 222 long increment = lim / numShards; 223 assertTrue(lim % numShards == 0); 224 for (int i = 0; i < numShards; i++) { 225 shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i])); 226 } 227 return shards; 228 } 229 230 /** 231 * Helper to run the loop to test all the permutations for the number of bytes specified. 232 * 233 * @param factory the factory for {@link ByteString} instances. 234 * @param numBytes the number of bytes in the byte array 235 * @param expectedCount the expected number of roundtrippable permutations 236 */ 237 static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) { 238 testBytes(factory, numBytes, expectedCount, 0, -1); 239 } 240 241 /** 242 * Helper to run the loop to test all the permutations for the number of bytes specified. This 243 * overload is useful for debugging to get the loop to start at a certain character. 244 * 245 * @param factory the factory for {@link ByteString} instances. 246 * @param numBytes the number of bytes in the byte array 247 * @param expectedCount the expected number of roundtrippable permutations 248 * @param start the starting bytes encoded as a long as big-endian 249 * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max 250 * limit for numBytes 251 */ 252 static void testBytes( 253 ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { 254 Random rnd = new Random(); 255 byte[] bytes = new byte[numBytes]; 256 257 if (lim == -1) { 258 lim = 1L << (numBytes * 8); 259 } 260 long count = 0; 261 long countRoundTripped = 0; 262 for (long byteChar = start; byteChar < lim; byteChar++) { 263 long tmpByteChar = byteChar; 264 for (int i = 0; i < numBytes; i++) { 265 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 266 tmpByteChar = tmpByteChar >> 8; 267 } 268 ByteString bs = factory.newByteString(bytes); 269 boolean isRoundTrippable = bs.isValidUtf8(); 270 String s = new String(bytes, Internal.UTF_8); 271 byte[] bytesReencoded = s.getBytes(Internal.UTF_8); 272 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); 273 274 if (bytesEqual != isRoundTrippable) { 275 outputFailure(byteChar, bytes, bytesReencoded); 276 } 277 278 // Check agreement with static Utf8 methods. 279 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes)); 280 assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes)); 281 282 try { 283 assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes)); 284 } catch (InvalidProtocolBufferException e) { 285 if (isRoundTrippable) { 286 System.out.println("Could not decode utf-8"); 287 outputFailure(byteChar, bytes, bytesReencoded); 288 } 289 } 290 291 // Test partial sequences. 292 // Partition numBytes into three segments (not necessarily non-empty). 293 int i = rnd.nextInt(numBytes); 294 int j = rnd.nextInt(numBytes); 295 if (j < i) { 296 int tmp = i; 297 i = j; 298 j = tmp; 299 } 300 int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i); 301 int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j); 302 int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes); 303 if (isRoundTrippable != (state3 == Utf8.COMPLETE)) { 304 System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j); 305 outputFailure(byteChar, bytes, bytesReencoded); 306 } 307 assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE)); 308 309 // Test ropes built out of small partial sequences 310 ByteString rope = 311 RopeByteString.newInstanceForTest( 312 bs.substring(0, i), 313 RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes))); 314 assertSame(RopeByteString.class, rope.getClass()); 315 316 ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope}; 317 for (ByteString x : byteStrings) { 318 assertEquals(isRoundTrippable, x.isValidUtf8()); 319 assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes)); 320 321 assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i)); 322 assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i)); 323 assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i)); 324 assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i)); 325 assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j)); 326 assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j)); 327 } 328 329 // ByteString reduplication should not affect its UTF-8 validity. 330 ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes)); 331 assertEquals(isRoundTrippable, ropeADope.isValidUtf8()); 332 333 if (isRoundTrippable) { 334 countRoundTripped++; 335 } 336 count++; 337 if (byteChar != 0 && byteChar % 1000000L == 0) { 338 logger.info("Processed " + (byteChar / 1000000L) + " million characters"); 339 } 340 } 341 logger.info("Round tripped " + countRoundTripped + " of " + count); 342 assertEquals(expectedCount, countRoundTripped); 343 } 344 345 /** 346 * Variation of {@link #testBytes} that does less allocation using the low-level encoders/decoders 347 * directly. Checked in because it's useful for debugging when trying to process bytes faster, but 348 * since it doesn't use the actual String class, it's possible for incompatibilities to develop 349 * (although unlikely). 350 * 351 * @param factory the factory for {@link ByteString} instances. 352 * @param numBytes the number of bytes in the byte array 353 * @param expectedCount the expected number of roundtrippable permutations 354 * @param start the starting bytes encoded as a long as big-endian 355 * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max 356 * limit for numBytes 357 */ 358 static void testBytesUsingByteBuffers( 359 ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { 360 CharsetDecoder decoder = 361 Internal.UTF_8 362 .newDecoder() 363 .onMalformedInput(CodingErrorAction.REPLACE) 364 .onUnmappableCharacter(CodingErrorAction.REPLACE); 365 CharsetEncoder encoder = 366 Internal.UTF_8 367 .newEncoder() 368 .onMalformedInput(CodingErrorAction.REPLACE) 369 .onUnmappableCharacter(CodingErrorAction.REPLACE); 370 byte[] bytes = new byte[numBytes]; 371 int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1; 372 char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1]; 373 int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1; 374 byte[] bytesReencoded = new byte[maxBytes]; 375 376 ByteBuffer bb = ByteBuffer.wrap(bytes); 377 CharBuffer cb = CharBuffer.wrap(charsDecoded); 378 ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded); 379 if (lim == -1) { 380 lim = 1L << (numBytes * 8); 381 } 382 long count = 0; 383 long countRoundTripped = 0; 384 for (long byteChar = start; byteChar < lim; byteChar++) { 385 bb.rewind(); 386 bb.limit(bytes.length); 387 cb.rewind(); 388 cb.limit(charsDecoded.length); 389 bbReencoded.rewind(); 390 bbReencoded.limit(bytesReencoded.length); 391 encoder.reset(); 392 decoder.reset(); 393 long tmpByteChar = byteChar; 394 for (int i = 0; i < bytes.length; i++) { 395 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 396 tmpByteChar = tmpByteChar >> 8; 397 } 398 boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8(); 399 CoderResult result = decoder.decode(bb, cb, true); 400 assertFalse(result.isError()); 401 result = decoder.flush(cb); 402 assertFalse(result.isError()); 403 404 int charLen = cb.position(); 405 cb.rewind(); 406 cb.limit(charLen); 407 result = encoder.encode(cb, bbReencoded, true); 408 assertFalse(result.isError()); 409 result = encoder.flush(bbReencoded); 410 assertFalse(result.isError()); 411 412 boolean bytesEqual = true; 413 int bytesLen = bbReencoded.position(); 414 if (bytesLen != numBytes) { 415 bytesEqual = false; 416 } else { 417 for (int i = 0; i < numBytes; i++) { 418 if (bytes[i] != bytesReencoded[i]) { 419 bytesEqual = false; 420 break; 421 } 422 } 423 } 424 if (bytesEqual != isRoundTrippable) { 425 outputFailure(byteChar, bytes, bytesReencoded, bytesLen); 426 } 427 428 count++; 429 if (isRoundTrippable) { 430 countRoundTripped++; 431 } 432 if (byteChar != 0 && byteChar % 1000000 == 0) { 433 logger.info("Processed " + (byteChar / 1000000) + " million characters"); 434 } 435 } 436 logger.info("Round tripped " + countRoundTripped + " of " + count); 437 assertEquals(expectedCount, countRoundTripped); 438 } 439 440 private static void outputFailure(long byteChar, byte[] bytes, byte[] after) { 441 outputFailure(byteChar, bytes, after, after.length); 442 } 443 444 private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) { 445 fail( 446 String.format( 447 "Failure: (%s) %s => %s", 448 Long.toHexString(byteChar), toHexString(bytes), toHexString(after, len))); 449 } 450 451 private static String toHexString(byte[] b) { 452 return toHexString(b, b.length); 453 } 454 455 private static String toHexString(byte[] b, int len) { 456 StringBuilder s = new StringBuilder(); 457 s.append("\""); 458 for (int i = 0; i < len; i++) { 459 if (i > 0) { 460 s.append(" "); 461 } 462 s.append(String.format("%02x", b[i] & 0xFF)); 463 } 464 s.append("\""); 465 return s.toString(); 466 } 467 } 468