1 package com.fasterxml.jackson.core.json; 2 3 import java.io.*; 4 5 import com.fasterxml.jackson.core.*; 6 import com.fasterxml.jackson.core.format.InputAccessor; 7 import com.fasterxml.jackson.core.format.MatchStrength; 8 import com.fasterxml.jackson.core.io.*; 9 import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; 10 import com.fasterxml.jackson.core.sym.CharsToNameCanonicalizer; 11 12 /** 13 * This class is used to determine the encoding of byte stream 14 * that is to contain JSON content. Rules are fairly simple, and 15 * defined in JSON specification (RFC-4627 or newer), except 16 * for BOM handling, which is a property of underlying 17 * streams. 18 */ 19 public final class ByteSourceJsonBootstrapper 20 { 21 public final static byte UTF8_BOM_1 = (byte) 0xEF; 22 public final static byte UTF8_BOM_2 = (byte) 0xBB; 23 public final static byte UTF8_BOM_3 = (byte) 0xBF; 24 25 /* 26 /********************************************************** 27 /* Configuration 28 /********************************************************** 29 */ 30 31 private final IOContext _context; 32 33 private final InputStream _in; 34 35 /* 36 /********************************************************** 37 /* Input buffering 38 /********************************************************** 39 */ 40 41 private final byte[] _inputBuffer; 42 43 private int _inputPtr; 44 45 private int _inputEnd; 46 47 /** 48 * Flag that indicates whether buffer above is to be recycled 49 * after being used or not. 50 */ 51 private final boolean _bufferRecyclable; 52 53 /* 54 /********************************************************** 55 /* Input location 56 /********************************************************** 57 */ 58 59 /** 60 * Current number of input units (bytes or chars) that were processed in 61 * previous blocks, 62 * before contents of current input buffer. 63 *<p> 64 * Note: includes possible BOMs, if those were part of the input. 65 */ 66 // private int _inputProcessed; 67 68 /* 69 /********************************************************** 70 /* Data gathered 71 /********************************************************** 72 */ 73 74 /** 75 * Whether input has been detected to be in Big-Endian encoding or not. 76 */ 77 private boolean _bigEndian = true; 78 79 private int _bytesPerChar; // 0 means "dunno yet" 80 81 /* 82 /********************************************************** 83 /* Life-cycle 84 /********************************************************** 85 */ 86 ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in)87 public ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in) { 88 _context = ctxt; 89 _in = in; 90 _inputBuffer = ctxt.allocReadIOBuffer(); 91 _inputEnd = _inputPtr = 0; 92 // _inputProcessed = 0; 93 _bufferRecyclable = true; 94 } 95 ByteSourceJsonBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen)96 public ByteSourceJsonBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen) { 97 _context = ctxt; 98 _in = null; 99 _inputBuffer = inputBuffer; 100 _inputPtr = inputStart; 101 _inputEnd = (inputStart + inputLen); 102 // Need to offset this for correct location info 103 // _inputProcessed = -inputStart; 104 _bufferRecyclable = false; 105 } 106 107 /* 108 /********************************************************** 109 /* Encoding detection during bootstrapping 110 /********************************************************** 111 */ 112 113 /** 114 * Method that should be called after constructing an instace. 115 * It will figure out encoding that content uses, to allow 116 * for instantiating a proper scanner object. 117 */ detectEncoding()118 public JsonEncoding detectEncoding() throws IOException 119 { 120 boolean foundEncoding = false; 121 122 // First things first: BOM handling 123 /* Note: we can require 4 bytes to be read, since no 124 * combination of BOM + valid JSON content can have 125 * shorter length (shortest valid JSON content is single 126 * digit char, but BOMs are chosen such that combination 127 * is always at least 4 chars long) 128 */ 129 if (ensureLoaded(4)) { 130 int quad = (_inputBuffer[_inputPtr] << 24) 131 | ((_inputBuffer[_inputPtr+1] & 0xFF) << 16) 132 | ((_inputBuffer[_inputPtr+2] & 0xFF) << 8) 133 | (_inputBuffer[_inputPtr+3] & 0xFF); 134 135 if (handleBOM(quad)) { 136 foundEncoding = true; 137 } else { 138 /* If no BOM, need to auto-detect based on first char; 139 * this works since it must be 7-bit ascii (wrt. unicode 140 * compatible encodings, only ones JSON can be transferred 141 * over) 142 */ 143 // UTF-32? 144 if (checkUTF32(quad)) { 145 foundEncoding = true; 146 } else if (checkUTF16(quad >>> 16)) { 147 foundEncoding = true; 148 } 149 } 150 } else if (ensureLoaded(2)) { 151 int i16 = ((_inputBuffer[_inputPtr] & 0xFF) << 8) 152 | (_inputBuffer[_inputPtr+1] & 0xFF); 153 if (checkUTF16(i16)) { 154 foundEncoding = true; 155 } 156 } 157 158 JsonEncoding enc; 159 160 /* Not found yet? As per specs, this means it must be UTF-8. */ 161 if (!foundEncoding) { 162 enc = JsonEncoding.UTF8; 163 } else { 164 switch (_bytesPerChar) { 165 case 1: enc = JsonEncoding.UTF8; 166 break; 167 case 2: enc = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE; 168 break; 169 case 4: enc = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE; 170 break; 171 default: throw new RuntimeException("Internal error"); // should never get here 172 } 173 } 174 _context.setEncoding(enc); 175 return enc; 176 } 177 178 /** 179 * Helper method that may be called to see if given {@link DataInput} 180 * has BOM marker, and if so, to skip it. 181 * @throws IOException 182 * 183 * @since 2.8 184 */ skipUTF8BOM(DataInput input)185 public static int skipUTF8BOM(DataInput input) throws IOException 186 { 187 int b = input.readUnsignedByte(); 188 if (b != 0xEF) { 189 return b; 190 } 191 // since this is not legal byte in JSON otherwise, except 192 // that we do get BOM; if not, report error 193 b = input.readUnsignedByte(); 194 if (b != 0xBB) { 195 throw new IOException("Unexpected byte 0x"+Integer.toHexString(b) 196 +" following 0xEF; should get 0xBB as part of UTF-8 BOM"); 197 } 198 b = input.readUnsignedByte(); 199 if (b != 0xBF) { 200 throw new IOException("Unexpected byte 0x"+Integer.toHexString(b) 201 +" following 0xEF 0xBB; should get 0xBF as part of UTF-8 BOM"); 202 } 203 return input.readUnsignedByte(); 204 } 205 206 /* 207 /********************************************************** 208 /* Constructing a Reader 209 /********************************************************** 210 */ 211 212 @SuppressWarnings("resource") constructReader()213 public Reader constructReader() throws IOException 214 { 215 JsonEncoding enc = _context.getEncoding(); 216 switch (enc.bits()) { 217 case 8: // only in non-common case where we don't want to do direct mapping 218 case 16: 219 { 220 // First: do we have a Stream? If not, need to create one: 221 InputStream in = _in; 222 223 if (in == null) { 224 in = new ByteArrayInputStream(_inputBuffer, _inputPtr, _inputEnd); 225 } else { 226 /* Also, if we have any read but unused input (usually true), 227 * need to merge that input in: 228 */ 229 if (_inputPtr < _inputEnd) { 230 in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd); 231 } 232 } 233 return new InputStreamReader(in, enc.getJavaName()); 234 } 235 case 32: 236 return new UTF32Reader(_context, _in, _inputBuffer, _inputPtr, _inputEnd, 237 _context.getEncoding().isBigEndian()); 238 } 239 throw new RuntimeException("Internal error"); // should never get here 240 } 241 constructParser(int parserFeatures, ObjectCodec codec, ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols, int factoryFeatures)242 public JsonParser constructParser(int parserFeatures, ObjectCodec codec, 243 ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols, 244 int factoryFeatures) throws IOException 245 { 246 int prevInputPtr = _inputPtr; 247 JsonEncoding enc = detectEncoding(); 248 int bytesProcessed = _inputPtr - prevInputPtr; 249 250 if (enc == JsonEncoding.UTF8) { 251 /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader 252 * (which is ok for larger input; not so hot for smaller; but this is not a common case) 253 */ 254 if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) { 255 ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures); 256 return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can, 257 _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable); 258 } 259 } 260 return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec, 261 rootCharSymbols.makeChild(factoryFeatures)); 262 } 263 264 /* 265 /********************************************************** 266 /* Encoding detection for data format auto-detection 267 /********************************************************** 268 */ 269 270 /** 271 * Current implementation is not as thorough as other functionality 272 * ({@link com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper}); 273 * supports UTF-8, for example. But it should work, for now, and can 274 * be improved as necessary. 275 */ hasJSONFormat(InputAccessor acc)276 public static MatchStrength hasJSONFormat(InputAccessor acc) throws IOException 277 { 278 // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String) 279 // in future could also consider accepting non-standard matches? 280 281 if (!acc.hasMoreBytes()) { 282 return MatchStrength.INCONCLUSIVE; 283 } 284 byte b = acc.nextByte(); 285 // Very first thing, a UTF-8 BOM? 286 if (b == UTF8_BOM_1) { // yes, looks like UTF-8 BOM 287 if (!acc.hasMoreBytes()) { 288 return MatchStrength.INCONCLUSIVE; 289 } 290 if (acc.nextByte() != UTF8_BOM_2) { 291 return MatchStrength.NO_MATCH; 292 } 293 if (!acc.hasMoreBytes()) { 294 return MatchStrength.INCONCLUSIVE; 295 } 296 if (acc.nextByte() != UTF8_BOM_3) { 297 return MatchStrength.NO_MATCH; 298 } 299 if (!acc.hasMoreBytes()) { 300 return MatchStrength.INCONCLUSIVE; 301 } 302 b = acc.nextByte(); 303 } 304 // Then possible leading space 305 int ch = skipSpace(acc, b); 306 if (ch < 0) { 307 return MatchStrength.INCONCLUSIVE; 308 } 309 // First, let's see if it looks like a structured type: 310 if (ch == '{') { // JSON object? 311 // Ideally we need to find either double-quote or closing bracket 312 ch = skipSpace(acc); 313 if (ch < 0) { 314 return MatchStrength.INCONCLUSIVE; 315 } 316 if (ch == '"' || ch == '}') { 317 return MatchStrength.SOLID_MATCH; 318 } 319 // ... should we allow non-standard? Let's not yet... can add if need be 320 return MatchStrength.NO_MATCH; 321 } 322 MatchStrength strength; 323 324 if (ch == '[') { 325 ch = skipSpace(acc); 326 if (ch < 0) { 327 return MatchStrength.INCONCLUSIVE; 328 } 329 // closing brackets is easy; but for now, let's also accept opening... 330 if (ch == ']' || ch == '[') { 331 return MatchStrength.SOLID_MATCH; 332 } 333 return MatchStrength.SOLID_MATCH; 334 } else { 335 // plain old value is not very convincing... 336 strength = MatchStrength.WEAK_MATCH; 337 } 338 339 if (ch == '"') { // string value 340 return strength; 341 } 342 if (ch <= '9' && ch >= '0') { // number 343 return strength; 344 } 345 if (ch == '-') { // negative number 346 ch = skipSpace(acc); 347 if (ch < 0) { 348 return MatchStrength.INCONCLUSIVE; 349 } 350 return (ch <= '9' && ch >= '0') ? strength : MatchStrength.NO_MATCH; 351 } 352 // or one of literals 353 if (ch == 'n') { // null 354 return tryMatch(acc, "ull", strength); 355 } 356 if (ch == 't') { // true 357 return tryMatch(acc, "rue", strength); 358 } 359 if (ch == 'f') { // false 360 return tryMatch(acc, "alse", strength); 361 } 362 return MatchStrength.NO_MATCH; 363 } 364 tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)365 private static MatchStrength tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength) 366 throws IOException 367 { 368 for (int i = 0, len = matchStr.length(); i < len; ++i) { 369 if (!acc.hasMoreBytes()) { 370 return MatchStrength.INCONCLUSIVE; 371 } 372 if (acc.nextByte() != matchStr.charAt(i)) { 373 return MatchStrength.NO_MATCH; 374 } 375 } 376 return fullMatchStrength; 377 } 378 skipSpace(InputAccessor acc)379 private static int skipSpace(InputAccessor acc) throws IOException 380 { 381 if (!acc.hasMoreBytes()) { 382 return -1; 383 } 384 return skipSpace(acc, acc.nextByte()); 385 } 386 skipSpace(InputAccessor acc, byte b)387 private static int skipSpace(InputAccessor acc, byte b) throws IOException 388 { 389 while (true) { 390 int ch = (int) b & 0xFF; 391 if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) { 392 return ch; 393 } 394 if (!acc.hasMoreBytes()) { 395 return -1; 396 } 397 b = acc.nextByte(); 398 } 399 } 400 401 /* 402 /********************************************************** 403 /* Internal methods, parsing 404 /********************************************************** 405 */ 406 407 /** 408 * @return True if a BOM was succesfully found, and encoding 409 * thereby recognized. 410 */ handleBOM(int quad)411 private boolean handleBOM(int quad) throws IOException 412 { 413 /* Handling of (usually) optional BOM (required for 414 * multi-byte formats); first 32-bit charsets: 415 */ 416 switch (quad) { 417 case 0x0000FEFF: 418 _bigEndian = true; 419 _inputPtr += 4; 420 _bytesPerChar = 4; 421 return true; 422 case 0xFFFE0000: // UCS-4, LE? 423 _inputPtr += 4; 424 _bytesPerChar = 4; 425 _bigEndian = false; 426 return true; 427 case 0x0000FFFE: // UCS-4, in-order... 428 reportWeirdUCS4("2143"); // throws exception 429 break; // never gets here 430 case 0xFEFF0000: // UCS-4, in-order... 431 reportWeirdUCS4("3412"); // throws exception 432 break; // never gets here 433 default: 434 } 435 // Ok, if not, how about 16-bit encoding BOMs? 436 int msw = quad >>> 16; 437 if (msw == 0xFEFF) { // UTF-16, BE 438 _inputPtr += 2; 439 _bytesPerChar = 2; 440 _bigEndian = true; 441 return true; 442 } 443 if (msw == 0xFFFE) { // UTF-16, LE 444 _inputPtr += 2; 445 _bytesPerChar = 2; 446 _bigEndian = false; 447 return true; 448 } 449 // And if not, then UTF-8 BOM? 450 if ((quad >>> 8) == 0xEFBBBF) { // UTF-8 451 _inputPtr += 3; 452 _bytesPerChar = 1; 453 _bigEndian = true; // doesn't really matter 454 return true; 455 } 456 return false; 457 } 458 checkUTF32(int quad)459 private boolean checkUTF32(int quad) throws IOException 460 { 461 /* Handling of (usually) optional BOM (required for 462 * multi-byte formats); first 32-bit charsets: 463 */ 464 if ((quad >> 8) == 0) { // 0x000000?? -> UTF32-BE 465 _bigEndian = true; 466 } else if ((quad & 0x00FFFFFF) == 0) { // 0x??000000 -> UTF32-LE 467 _bigEndian = false; 468 } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order 469 reportWeirdUCS4("3412"); 470 } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order 471 reportWeirdUCS4("2143"); 472 } else { 473 // Can not be valid UTF-32 encoded JSON... 474 return false; 475 } 476 // Not BOM (just regular content), nothing to skip past: 477 //_inputPtr += 4; 478 _bytesPerChar = 4; 479 return true; 480 } 481 checkUTF16(int i16)482 private boolean checkUTF16(int i16) 483 { 484 if ((i16 & 0xFF00) == 0) { // UTF-16BE 485 _bigEndian = true; 486 } else if ((i16 & 0x00FF) == 0) { // UTF-16LE 487 _bigEndian = false; 488 } else { // nope, not UTF-16 489 return false; 490 } 491 // Not BOM (just regular content), nothing to skip past: 492 //_inputPtr += 2; 493 _bytesPerChar = 2; 494 return true; 495 } 496 497 /* 498 /********************************************************** 499 /* Internal methods, problem reporting 500 /********************************************************** 501 */ 502 reportWeirdUCS4(String type)503 private void reportWeirdUCS4(String type) throws IOException { 504 throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected"); 505 } 506 507 /* 508 /********************************************************** 509 /* Internal methods, raw input access 510 /********************************************************** 511 */ 512 ensureLoaded(int minimum)513 protected boolean ensureLoaded(int minimum) throws IOException { 514 /* Let's assume here buffer has enough room -- this will always 515 * be true for the limited used this method gets 516 */ 517 int gotten = (_inputEnd - _inputPtr); 518 while (gotten < minimum) { 519 int count; 520 521 if (_in == null) { // block source 522 count = -1; 523 } else { 524 count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd); 525 } 526 if (count < 1) { 527 return false; 528 } 529 _inputEnd += count; 530 gotten += count; 531 } 532 return true; 533 } 534 } 535