1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file or at 6 // https://developers.google.com/open-source/licenses/bsd 7 8 package com.google.protobuf; 9 10 import com.google.protobuf.Utf8.Processor; 11 import com.google.protobuf.Utf8.SafeProcessor; 12 import com.google.protobuf.Utf8.UnsafeProcessor; 13 import java.nio.ByteBuffer; 14 import java.util.ArrayList; 15 import java.util.List; 16 import java.util.logging.Logger; 17 import junit.framework.TestCase; 18 19 public class DecodeUtf8Test extends TestCase { 20 private static Logger logger = Logger.getLogger(DecodeUtf8Test.class.getName()); 21 22 private static final Processor SAFE_PROCESSOR = new SafeProcessor(); 23 private static final Processor UNSAFE_PROCESSOR = new UnsafeProcessor(); 24 testRoundTripAllValidChars()25 public void testRoundTripAllValidChars() throws Exception { 26 for (int i = Character.MIN_CODE_POINT; i < Character.MAX_CODE_POINT; i++) { 27 if (i < Character.MIN_SURROGATE || i > Character.MAX_SURROGATE) { 28 String str = new String(Character.toChars(i)); 29 assertRoundTrips(str); 30 } 31 } 32 } 33 34 // Test all 1, 2, 3 invalid byte combinations. Valid ones would have been covered above. 35 testOneByte()36 public void testOneByte() throws Exception { 37 int valid = 0; 38 ByteBuffer buffer = ByteBuffer.allocateDirect(1); 39 for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; i++) { 40 ByteString bs = ByteString.copyFrom(new byte[] {(byte) i}); 41 if (bs.isValidUtf8()) { 42 valid++; 43 } else { 44 assertInvalid(bs.toByteArray(), buffer); 45 } 46 } 47 assertEquals(IsValidUtf8TestUtil.EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, valid); 48 } 49 testTwoBytes()50 public void testTwoBytes() throws Exception { 51 int valid = 0; 52 ByteBuffer buffer = ByteBuffer.allocateDirect(2); 53 for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; i++) { 54 for (int j = Byte.MIN_VALUE; j <= Byte.MAX_VALUE; j++) { 55 ByteString bs = ByteString.copyFrom(new byte[] {(byte) i, (byte) j}); 56 if (bs.isValidUtf8()) { 57 valid++; 58 } else { 59 assertInvalid(bs.toByteArray(), buffer); 60 } 61 } 62 } 63 assertEquals(IsValidUtf8TestUtil.EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT, valid); 64 } 65 testThreeBytes()66 public void testThreeBytes() throws Exception { 67 int count = 0; 68 int valid = 0; 69 ByteBuffer buffer = ByteBuffer.allocateDirect(3); 70 for (int i = Byte.MIN_VALUE; i <= Byte.MAX_VALUE; i++) { 71 for (int j = Byte.MIN_VALUE; j <= Byte.MAX_VALUE; j++) { 72 for (int k = Byte.MIN_VALUE; k <= Byte.MAX_VALUE; k++) { 73 byte[] bytes = new byte[] {(byte) i, (byte) j, (byte) k}; 74 ByteString bs = ByteString.copyFrom(bytes); 75 if (bs.isValidUtf8()) { 76 valid++; 77 } else { 78 assertInvalid(bytes, buffer); 79 } 80 count++; 81 if (count % 1000000L == 0) { 82 logger.info("Processed " + (count / 1000000L) + " million characters"); 83 } 84 } 85 } 86 } 87 assertEquals(IsValidUtf8TestUtil.EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT, valid); 88 } 89 90 /** Tests that round tripping of a sample of four byte permutations work. */ testInvalid_4BytesSamples()91 public void testInvalid_4BytesSamples() throws Exception { 92 // Bad trailing bytes 93 assertInvalid(0xF0, 0xA4, 0xAD, 0x7F); 94 assertInvalid(0xF0, 0xA4, 0xAD, 0xC0); 95 96 // Special cases for byte2 97 assertInvalid(0xF0, 0x8F, 0xAD, 0xA2); 98 assertInvalid(0xF4, 0x90, 0xAD, 0xA2); 99 } 100 testRealStrings()101 public void testRealStrings() throws Exception { 102 // English 103 assertRoundTrips("The quick brown fox jumps over the lazy dog"); 104 // German 105 assertRoundTrips("Quizdeltagerne spiste jordb\u00e6r med fl\u00f8de, mens cirkusklovnen"); 106 // Japanese 107 assertRoundTrips("\u3044\u308d\u306f\u306b\u307b\u3078\u3068\u3061\u308a\u306c\u308b\u3092"); 108 // Hebrew 109 assertRoundTrips( 110 "\u05d3\u05d2 \u05e1\u05e7\u05e8\u05df \u05e9\u05d8 \u05d1\u05d9\u05dd " 111 + "\u05de\u05d0\u05d5\u05db\u05d6\u05d1 \u05d5\u05dc\u05e4\u05ea\u05e2" 112 + " \u05de\u05e6\u05d0 \u05dc\u05d5 \u05d7\u05d1\u05e8\u05d4 " 113 + "\u05d0\u05d9\u05da \u05d4\u05e7\u05dc\u05d9\u05d8\u05d4"); 114 // Thai 115 assertRoundTrips( 116 " \u0e08\u0e07\u0e1d\u0e48\u0e32\u0e1f\u0e31\u0e19\u0e1e\u0e31\u0e12" 117 + "\u0e19\u0e32\u0e27\u0e34\u0e0a\u0e32\u0e01\u0e32\u0e23"); 118 // Chinese 119 assertRoundTrips( 120 "\u8fd4\u56de\u94fe\u4e2d\u7684\u4e0b\u4e00\u4e2a\u4ee3\u7406\u9879\u9009\u62e9\u5668"); 121 // Chinese with 4-byte chars 122 assertRoundTrips( 123 "\uD841\uDF0E\uD841\uDF31\uD841\uDF79\uD843\uDC53\uD843\uDC78" 124 + "\uD843\uDC96\uD843\uDCCF\uD843\uDCD5\uD843\uDD15\uD843\uDD7C\uD843\uDD7F" 125 + "\uD843\uDE0E\uD843\uDE0F\uD843\uDE77\uD843\uDE9D\uD843\uDEA2"); 126 // Mixed 127 assertRoundTrips( 128 "The quick brown \u3044\u308d\u306f\u306b\u307b\u3078\u8fd4\u56de\u94fe" 129 + "\u4e2d\u7684\u4e0b\u4e00"); 130 } 131 testOverlong()132 public void testOverlong() throws Exception { 133 assertInvalid(0xc0, 0xaf); 134 assertInvalid(0xe0, 0x80, 0xaf); 135 assertInvalid(0xf0, 0x80, 0x80, 0xaf); 136 137 // Max overlong 138 assertInvalid(0xc1, 0xbf); 139 assertInvalid(0xe0, 0x9f, 0xbf); 140 assertInvalid(0xf0, 0x8f, 0xbf, 0xbf); 141 142 // null overlong 143 assertInvalid(0xc0, 0x80); 144 assertInvalid(0xe0, 0x80, 0x80); 145 assertInvalid(0xf0, 0x80, 0x80, 0x80); 146 } 147 testIllegalCodepoints()148 public void testIllegalCodepoints() throws Exception { 149 // Single surrogate 150 assertInvalid(0xed, 0xa0, 0x80); 151 assertInvalid(0xed, 0xad, 0xbf); 152 assertInvalid(0xed, 0xae, 0x80); 153 assertInvalid(0xed, 0xaf, 0xbf); 154 assertInvalid(0xed, 0xb0, 0x80); 155 assertInvalid(0xed, 0xbe, 0x80); 156 assertInvalid(0xed, 0xbf, 0xbf); 157 158 // Paired surrogates 159 assertInvalid(0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80); 160 assertInvalid(0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf); 161 assertInvalid(0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80); 162 assertInvalid(0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf); 163 assertInvalid(0xed, 0xae, 0x80, 0xed, 0xb0, 0x80); 164 assertInvalid(0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf); 165 assertInvalid(0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80); 166 assertInvalid(0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf); 167 } 168 testBufferSlice()169 public void testBufferSlice() throws Exception { 170 String str = "The quick brown fox jumps over the lazy dog"; 171 assertRoundTrips(str, 10, 4); 172 assertRoundTrips(str, str.length(), 0); 173 } 174 testInvalidBufferSlice()175 public void testInvalidBufferSlice() throws Exception { 176 byte[] bytes = "The quick brown fox jumps over the lazy dog".getBytes(Internal.UTF_8); 177 assertInvalidSlice(bytes, bytes.length - 3, 4); 178 assertInvalidSlice(bytes, bytes.length, 1); 179 assertInvalidSlice(bytes, bytes.length + 1, 0); 180 assertInvalidSlice(bytes, 0, bytes.length + 1); 181 } 182 assertInvalid(int... bytesAsInt)183 private void assertInvalid(int... bytesAsInt) throws Exception { 184 byte[] bytes = new byte[bytesAsInt.length]; 185 for (int i = 0; i < bytesAsInt.length; i++) { 186 bytes[i] = (byte) bytesAsInt[i]; 187 } 188 assertInvalid(bytes, null); 189 } 190 191 // Attempts to decode the byte array in several ways and asserts that it always generates an 192 // exception. Allocating a direct ByteBuffer is slow, so the caller can optionally provide a 193 // buffer to reuse. If buffer is non-null, it must be a direct-allocated ByteBuffer of the 194 // appropriate size. assertInvalid(byte[] bytes, ByteBuffer buffer)195 private void assertInvalid(byte[] bytes, ByteBuffer buffer) throws Exception { 196 try { 197 UNSAFE_PROCESSOR.decodeUtf8(bytes, 0, bytes.length); 198 fail(); 199 } catch (InvalidProtocolBufferException e) { 200 // Expected. 201 } 202 try { 203 SAFE_PROCESSOR.decodeUtf8(bytes, 0, bytes.length); 204 fail(); 205 } catch (InvalidProtocolBufferException e) { 206 // Expected. 207 } 208 209 if (buffer == null) { 210 buffer = ByteBuffer.allocateDirect(bytes.length); 211 } 212 buffer.put(bytes); 213 buffer.flip(); 214 try { 215 UNSAFE_PROCESSOR.decodeUtf8(buffer, 0, bytes.length); 216 fail(); 217 } catch (InvalidProtocolBufferException e) { 218 // Expected. 219 } 220 try { 221 SAFE_PROCESSOR.decodeUtf8(buffer, 0, bytes.length); 222 fail(); 223 } catch (InvalidProtocolBufferException e) { 224 // Expected. 225 } 226 buffer.clear(); 227 } 228 assertInvalidSlice(byte[] bytes, int index, int size)229 private void assertInvalidSlice(byte[] bytes, int index, int size) throws Exception { 230 try { 231 UNSAFE_PROCESSOR.decodeUtf8(bytes, index, size); 232 fail(); 233 } catch (IndexOutOfBoundsException e) { 234 // Expected. 235 } 236 try { 237 SAFE_PROCESSOR.decodeUtf8(bytes, index, size); 238 fail(); 239 } catch (IndexOutOfBoundsException e) { 240 // Expected. 241 } 242 243 ByteBuffer direct = ByteBuffer.allocateDirect(bytes.length); 244 direct.put(bytes); 245 direct.flip(); 246 try { 247 UNSAFE_PROCESSOR.decodeUtf8(direct, index, size); 248 fail(); 249 } catch (IndexOutOfBoundsException e) { 250 // Expected. 251 } 252 try { 253 SAFE_PROCESSOR.decodeUtf8(direct, index, size); 254 fail(); 255 } catch (IndexOutOfBoundsException e) { 256 // Expected. 257 } 258 259 ByteBuffer heap = ByteBuffer.allocate(bytes.length); 260 heap.put(bytes); 261 heap.flip(); 262 try { 263 UNSAFE_PROCESSOR.decodeUtf8(heap, index, size); 264 fail(); 265 } catch (IndexOutOfBoundsException e) { 266 // Expected. 267 } 268 try { 269 SAFE_PROCESSOR.decodeUtf8(heap, index, size); 270 fail(); 271 } catch (IndexOutOfBoundsException e) { 272 // Expected. 273 } 274 } 275 assertRoundTrips(String str)276 private void assertRoundTrips(String str) throws Exception { 277 assertRoundTrips(str, 0, -1); 278 } 279 assertRoundTrips(String str, int index, int size)280 private void assertRoundTrips(String str, int index, int size) throws Exception { 281 byte[] bytes = str.getBytes(Internal.UTF_8); 282 if (size == -1) { 283 size = bytes.length; 284 } 285 assertDecode( 286 new String(bytes, index, size, Internal.UTF_8), 287 UNSAFE_PROCESSOR.decodeUtf8(bytes, index, size)); 288 assertDecode( 289 new String(bytes, index, size, Internal.UTF_8), 290 SAFE_PROCESSOR.decodeUtf8(bytes, index, size)); 291 292 ByteBuffer direct = ByteBuffer.allocateDirect(bytes.length); 293 direct.put(bytes); 294 direct.flip(); 295 assertDecode( 296 new String(bytes, index, size, Internal.UTF_8), 297 UNSAFE_PROCESSOR.decodeUtf8(direct, index, size)); 298 assertDecode( 299 new String(bytes, index, size, Internal.UTF_8), 300 SAFE_PROCESSOR.decodeUtf8(direct, index, size)); 301 302 ByteBuffer heap = ByteBuffer.allocate(bytes.length); 303 heap.put(bytes); 304 heap.flip(); 305 assertDecode( 306 new String(bytes, index, size, Internal.UTF_8), 307 UNSAFE_PROCESSOR.decodeUtf8(heap, index, size)); 308 assertDecode( 309 new String(bytes, index, size, Internal.UTF_8), 310 SAFE_PROCESSOR.decodeUtf8(heap, index, size)); 311 } 312 assertDecode(String expected, String actual)313 private void assertDecode(String expected, String actual) { 314 if (!expected.equals(actual)) { 315 fail("Failure: Expected (" + codepoints(expected) + ") Actual (" + codepoints(actual) + ")"); 316 } 317 } 318 codepoints(String str)319 private List<String> codepoints(String str) { 320 List<String> codepoints = new ArrayList<String>(); 321 for (int i = 0; i < str.length(); i++) { 322 codepoints.add(Long.toHexString(str.charAt(i))); 323 } 324 return codepoints; 325 } 326 } 327