• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.assertFalse;
35 import static org.junit.Assert.assertSame;
36 import static org.junit.Assert.assertTrue;
37 import static org.junit.Assert.fail;
38 
39 import java.lang.ref.SoftReference;
40 import java.nio.ByteBuffer;
41 import java.nio.CharBuffer;
42 import java.nio.charset.CharsetDecoder;
43 import java.nio.charset.CharsetEncoder;
44 import java.nio.charset.CoderResult;
45 import java.nio.charset.CodingErrorAction;
46 import java.util.ArrayList;
47 import java.util.Arrays;
48 import java.util.List;
49 import java.util.Random;
50 import java.util.logging.Logger;
51 
52 /**
53  * Shared testing code for {@link IsValidUtf8Test} and {@link IsValidUtf8FourByteTest}.
54  *
55  * @author jonp@google.com (Jon Perlow)
56  * @author martinrb@google.com (Martin Buchholz)
57  */
58 final class IsValidUtf8TestUtil {
59   private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName());
60 
IsValidUtf8TestUtil()61   private IsValidUtf8TestUtil() {}
62 
63   static interface ByteStringFactory {
newByteString(byte[] bytes)64     ByteString newByteString(byte[] bytes);
65   }
66 
67   static final ByteStringFactory LITERAL_FACTORY =
68       new ByteStringFactory() {
69         @Override
70         public ByteString newByteString(byte[] bytes) {
71           return ByteString.wrap(bytes);
72         }
73       };
74 
75   static final ByteStringFactory HEAP_NIO_FACTORY =
76       new ByteStringFactory() {
77         @Override
78         public ByteString newByteString(byte[] bytes) {
79           return new NioByteString(ByteBuffer.wrap(bytes));
80         }
81       };
82 
83   private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer =
84       new ThreadLocal<SoftReference<ByteBuffer>>();
85 
86   /**
87    * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this uses a
88    * thread local direct buffer. This means that each call will overwrite the buffer's contents from
89    * the previous call, so the calling code must be careful not to continue using a buffer returned
90    * from a previous invocation.
91    */
92   static final ByteStringFactory DIRECT_NIO_FACTORY =
93       new ByteStringFactory() {
94         @Override
95         public ByteString newByteString(byte[] bytes) {
96           SoftReference<ByteBuffer> ref = directBuffer.get();
97           ByteBuffer buffer = ref == null ? null : ref.get();
98           if (buffer == null || buffer.capacity() < bytes.length) {
99             buffer = ByteBuffer.allocateDirect(bytes.length);
100             directBuffer.set(new SoftReference<ByteBuffer>(buffer));
101           }
102           buffer.clear();
103           buffer.put(bytes);
104           buffer.flip();
105           return new NioByteString(buffer);
106         }
107       };
108 
109   // 128 - [chars 0x0000 to 0x007f]
110   static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
111 
112   // 128
113   static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
114 
115   // 1920 [chars 0x0080 to 0x07FF]
116   static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
117 
118   // 18,304
119   static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
120       // Both bytes are one byte characters
121       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
122           +
123           // The possible number of two byte characters
124           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
125 
126   // 2048
127   static final long THREE_BYTE_SURROGATES = 2 * 1024;
128 
129   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
130   static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
132 
133   // 2,650,112
134   static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
135       // All one byte characters
136       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
137           +
138           // One two byte character and a one byte character
139           2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
140           +
141           // Three byte characters
142           THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
143 
144   // 1,048,576 [chars 0x10000L to 0x10FFFF]
145   static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
146 
147   // 289,571,839
148   static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
149       // All one byte characters
150       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
151           +
152           // One and three byte characters
153           2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
154           +
155           // Two two byte characters
156           TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
157           +
158           // Permutations of one and two byte characters
159           3
160               * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
161               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
162               * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
163           +
164           // Four byte characters
165           FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
166 
167   static final class Shard {
168     final long index;
169     final long start;
170     final long lim;
171     final long expected;
172 
Shard(long index, long start, long lim, long expected)173     public Shard(long index, long start, long lim, long expected) {
174       assertTrue(start < lim);
175       this.index = index;
176       this.start = start;
177       this.lim = lim;
178       this.expected = expected;
179     }
180   }
181 
182   static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
183       generateFourByteShardsExpectedRunnables();
184 
185   private static long[] generateFourByteShardsExpectedRunnables() {
186     long[] expected = new long[128];
187 
188     // 0-63 are all 5300224
189     for (int i = 0; i <= 63; i++) {
190       expected[i] = 5300224;
191     }
192 
193     // 97-111 are all 2342912
194     for (int i = 97; i <= 111; i++) {
195       expected[i] = 2342912;
196     }
197 
198     // 113-117 are all 1048576
199     for (int i = 113; i <= 117; i++) {
200       expected[i] = 1048576;
201     }
202 
203     // One offs
204     expected[112] = 786432;
205     expected[118] = 786432;
206     expected[119] = 1048576;
207     expected[120] = 458752;
208     expected[121] = 524288;
209     expected[122] = 65536;
210 
211     // Anything not assigned was the default 0.
212     return expected;
213   }
214 
215   static final List<Shard> FOUR_BYTE_SHARDS =
216       generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
217 
218   private static List<Shard> generateFourByteShards(int numShards, long[] expected) {
219     assertEquals(numShards, expected.length);
220     List<Shard> shards = new ArrayList<Shard>(numShards);
221     long lim = 1L << 32;
222     long increment = lim / numShards;
223     assertTrue(lim % numShards == 0);
224     for (int i = 0; i < numShards; i++) {
225       shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i]));
226     }
227     return shards;
228   }
229 
230   /**
231    * Helper to run the loop to test all the permutations for the number of bytes specified.
232    *
233    * @param factory the factory for {@link ByteString} instances.
234    * @param numBytes the number of bytes in the byte array
235    * @param expectedCount the expected number of roundtrippable permutations
236    */
237   static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) {
238     testBytes(factory, numBytes, expectedCount, 0, -1);
239   }
240 
241   /**
242    * Helper to run the loop to test all the permutations for the number of bytes specified. This
243    * overload is useful for debugging to get the loop to start at a certain character.
244    *
245    * @param factory the factory for {@link ByteString} instances.
246    * @param numBytes the number of bytes in the byte array
247    * @param expectedCount the expected number of roundtrippable permutations
248    * @param start the starting bytes encoded as a long as big-endian
249    * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
250    *     limit for numBytes
251    */
252   static void testBytes(
253       ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
254     Random rnd = new Random();
255     byte[] bytes = new byte[numBytes];
256 
257     if (lim == -1) {
258       lim = 1L << (numBytes * 8);
259     }
260     long count = 0;
261     long countRoundTripped = 0;
262     for (long byteChar = start; byteChar < lim; byteChar++) {
263       long tmpByteChar = byteChar;
264       for (int i = 0; i < numBytes; i++) {
265         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
266         tmpByteChar = tmpByteChar >> 8;
267       }
268       ByteString bs = factory.newByteString(bytes);
269       boolean isRoundTrippable = bs.isValidUtf8();
270       String s = new String(bytes, Internal.UTF_8);
271       byte[] bytesReencoded = s.getBytes(Internal.UTF_8);
272       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
273 
274       if (bytesEqual != isRoundTrippable) {
275         outputFailure(byteChar, bytes, bytesReencoded);
276       }
277 
278       // Check agreement with static Utf8 methods.
279       assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
280       assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));
281 
282       try {
283         assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes));
284       } catch (InvalidProtocolBufferException e) {
285         if (isRoundTrippable) {
286           System.out.println("Could not decode utf-8");
287           outputFailure(byteChar, bytes, bytesReencoded);
288         }
289       }
290 
291       // Test partial sequences.
292       // Partition numBytes into three segments (not necessarily non-empty).
293       int i = rnd.nextInt(numBytes);
294       int j = rnd.nextInt(numBytes);
295       if (j < i) {
296         int tmp = i;
297         i = j;
298         j = tmp;
299       }
300       int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
301       int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
302       int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
303       if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
304         System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j);
305         outputFailure(byteChar, bytes, bytesReencoded);
306       }
307       assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
308 
309       // Test ropes built out of small partial sequences
310       ByteString rope =
311           RopeByteString.newInstanceForTest(
312               bs.substring(0, i),
313               RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes)));
314       assertSame(RopeByteString.class, rope.getClass());
315 
316       ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope};
317       for (ByteString x : byteStrings) {
318         assertEquals(isRoundTrippable, x.isValidUtf8());
319         assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
320 
321         assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
322         assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
323         assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i));
324         assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
325         assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j));
326         assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j));
327       }
328 
329       // ByteString reduplication should not affect its UTF-8 validity.
330       ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
331       assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
332 
333       if (isRoundTrippable) {
334         countRoundTripped++;
335       }
336       count++;
337       if (byteChar != 0 && byteChar % 1000000L == 0) {
338         logger.info("Processed " + (byteChar / 1000000L) + " million characters");
339       }
340     }
341     logger.info("Round tripped " + countRoundTripped + " of " + count);
342     assertEquals(expectedCount, countRoundTripped);
343   }
344 
345   /**
346    * Variation of {@link #testBytes} that does less allocation using the low-level encoders/decoders
347    * directly. Checked in because it's useful for debugging when trying to process bytes faster, but
348    * since it doesn't use the actual String class, it's possible for incompatibilities to develop
349    * (although unlikely).
350    *
351    * @param factory the factory for {@link ByteString} instances.
352    * @param numBytes the number of bytes in the byte array
353    * @param expectedCount the expected number of roundtrippable permutations
354    * @param start the starting bytes encoded as a long as big-endian
355    * @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
356    *     limit for numBytes
357    */
358   static void testBytesUsingByteBuffers(
359       ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
360     CharsetDecoder decoder =
361         Internal.UTF_8
362             .newDecoder()
363             .onMalformedInput(CodingErrorAction.REPLACE)
364             .onUnmappableCharacter(CodingErrorAction.REPLACE);
365     CharsetEncoder encoder =
366         Internal.UTF_8
367             .newEncoder()
368             .onMalformedInput(CodingErrorAction.REPLACE)
369             .onUnmappableCharacter(CodingErrorAction.REPLACE);
370     byte[] bytes = new byte[numBytes];
371     int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
372     char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
373     int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
374     byte[] bytesReencoded = new byte[maxBytes];
375 
376     ByteBuffer bb = ByteBuffer.wrap(bytes);
377     CharBuffer cb = CharBuffer.wrap(charsDecoded);
378     ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
379     if (lim == -1) {
380       lim = 1L << (numBytes * 8);
381     }
382     long count = 0;
383     long countRoundTripped = 0;
384     for (long byteChar = start; byteChar < lim; byteChar++) {
385       bb.rewind();
386       bb.limit(bytes.length);
387       cb.rewind();
388       cb.limit(charsDecoded.length);
389       bbReencoded.rewind();
390       bbReencoded.limit(bytesReencoded.length);
391       encoder.reset();
392       decoder.reset();
393       long tmpByteChar = byteChar;
394       for (int i = 0; i < bytes.length; i++) {
395         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
396         tmpByteChar = tmpByteChar >> 8;
397       }
398       boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8();
399       CoderResult result = decoder.decode(bb, cb, true);
400       assertFalse(result.isError());
401       result = decoder.flush(cb);
402       assertFalse(result.isError());
403 
404       int charLen = cb.position();
405       cb.rewind();
406       cb.limit(charLen);
407       result = encoder.encode(cb, bbReencoded, true);
408       assertFalse(result.isError());
409       result = encoder.flush(bbReencoded);
410       assertFalse(result.isError());
411 
412       boolean bytesEqual = true;
413       int bytesLen = bbReencoded.position();
414       if (bytesLen != numBytes) {
415         bytesEqual = false;
416       } else {
417         for (int i = 0; i < numBytes; i++) {
418           if (bytes[i] != bytesReencoded[i]) {
419             bytesEqual = false;
420             break;
421           }
422         }
423       }
424       if (bytesEqual != isRoundTrippable) {
425         outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
426       }
427 
428       count++;
429       if (isRoundTrippable) {
430         countRoundTripped++;
431       }
432       if (byteChar != 0 && byteChar % 1000000 == 0) {
433         logger.info("Processed " + (byteChar / 1000000) + " million characters");
434       }
435     }
436     logger.info("Round tripped " + countRoundTripped + " of " + count);
437     assertEquals(expectedCount, countRoundTripped);
438   }
439 
440   private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
441     outputFailure(byteChar, bytes, after, after.length);
442   }
443 
444   private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) {
445     fail(
446         String.format(
447             "Failure: (%s) %s => %s",
448             Long.toHexString(byteChar), toHexString(bytes), toHexString(after, len)));
449   }
450 
451   private static String toHexString(byte[] b) {
452     return toHexString(b, b.length);
453   }
454 
455   private static String toHexString(byte[] b, int len) {
456     StringBuilder s = new StringBuilder();
457     s.append("\"");
458     for (int i = 0; i < len; i++) {
459       if (i > 0) {
460         s.append(" ");
461       }
462       s.append(String.format("%02x", b[i] & 0xFF));
463     }
464     s.append("\"");
465     return s.toString();
466   }
467 }
468