package com.fasterxml.jackson.core.sym;

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReference;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.util.InternCache;

/**
 * Replacement for <code>BytesToNameCanonicalizer</code> which aims at more localized
 * memory access due to flattening of name quad data.
 * Performance improvement modest for simple JSON document data binding (maybe 3%),
 * but should help more for larger symbol tables, or for binary formats like Smile.
 *<p>
 * Hash area is divided into 4 sections:
 *<ol>
 * <li>Primary area (1/2 of total size), direct match from hash (LSB)</li>
 * <li>Secondary area (1/4 of total size), match from {@code hash (LSB) >> 1}</li>
 * <li>Tertiary area (1/8 of total size), match from {@code hash (LSB) >> 2}</li>
 * <li>Spill-over area (remaining 1/8) with linear scan, insertion order</li>
 * </ol>
 * and within every area, entries are 4 {@code int}s, where 1 - 3 ints contain 1 - 12
 * UTF-8 encoded bytes of name (null-padded), and last int is offset in
 * {@code _names} that contains actual name Strings.
 *
 * @since 2.6
 */
public final class ByteQuadsCanonicalizer
{
    /**
     * Initial size of the primary hash area. Each entry consumes 4 ints (16 bytes),
     * and secondary area is same as primary; so default size will use 2kB of memory
     * (plus 64x4 or 64x8 (256/512 bytes) for references to Strings, and Strings
     * themselves).
     */
    private static final int DEFAULT_T_SIZE = 64;
//    private static final int DEFAULT_T_SIZE = 256;

    /**
     * Let's not expand symbol tables past some maximum size;
     * with unique (~= random) names.
     * Size is in entries (primary slots), not bytes.
     */
    private static final int MAX_T_SIZE = 0x10000; // 64k entries == 2M mem hash area

    /**
     * No point in trying to construct tiny tables, just need to resize soon.
     */
    private final static int MIN_HASH_SIZE = 16;

    /**
     * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 8k;
     * this corresponds to 256k main hash index. This should allow for enough distinct
     * names for almost any case, while preventing ballooning for cases where names
     * are unique (or close thereof).
     */
    final static int MAX_ENTRIES_FOR_REUSE = 6000;

    /*
    /**********************************************************
    /* Linkage, needed for merging symbol tables
    /**********************************************************
     */

    /**
     * Reference to the root symbol table, for child tables, so
     * that they can merge table information back as necessary.
     */
    final private ByteQuadsCanonicalizer _parent;

    /**
     * Member that is only used by the root table instance: root
     * passes immutable state info child instances, and children
     * may return new state if they add entries to the table.
     * Child tables do NOT use the reference.
     */
    final private AtomicReference<TableInfo> _tableInfo;

    /**
     * Seed value we use as the base to make hash codes non-static between
     * different runs, but still stable for lifetime of a single symbol table
     * instance.
     * This is done for security reasons, to avoid potential DoS attack via
     * hash collisions.
     */
    final private int _seed;

    /*
    /**********************************************************
    /* Configuration
    /**********************************************************
     */

    /**
     * Whether canonical symbol Strings are to be intern()ed before added
     * to the table or not.
     *<p>
     * NOTE: non-final to allow disabling intern()ing in case of excessive
     * collisions.
     */
    private boolean _intern;

    /**
     * Flag that indicates whether we should throw an exception if enough
     * hash collisions are detected (true); or just worked around (false).
     *
     * @since 2.4
     */
    private final boolean _failOnDoS;

    /*
    /**********************************************************
    /* First, main hash area info
    /**********************************************************
     */

    /**
     * Primary hash information area: consists of <code>2 * _hashSize</code>
     * entries of 16 bytes (4 ints), arranged in a cascading lookup
     * structure (details of which may be tweaked depending on expected rates
     * of collisions).
     */
    private int[] _hashArea;

    /**
     * Number of slots for primary entries within {@link #_hashArea}; which is
     * at most <code>1/8</code> of actual size of the underlying array (4-int slots,
     * primary covers only half of the area; plus, additional area for longer
     * symbols after hash area).
     */
    private int _hashSize;

    /**
     * Offset within {@link #_hashArea} where secondary entries start
     */
    private int _secondaryStart;

    /**
     * Offset within {@link #_hashArea} where tertiary entries start
     */
    private int _tertiaryStart;

    /**
     * Constant that determines size of buckets for tertiary entries:
     * <code>1 << _tertiaryShift</code> is the size, and shift value
     * is also used for translating from primary offset into
     * tertiary bucket (shift right by <code>4 + _tertiaryShift</code>).
     *<p>
     * Default value is 2, for buckets of 4 slots; grows bigger with
     * bigger table sizes.
     */
    private int _tertiaryShift;

    /**
     * Total number of Strings in the symbol table; only used for child tables.
     */
    private int _count;

    /**
     * Array that contains <code>String</code> instances matching
     * entries in {@link #_hashArea}.
     * Contains nulls for unused entries.
     * Note that this size is twice
     * that of {@link #_hashArea}
     */
    private String[] _names;

    /*
    /**********************************************************
    /* Then information on collisions etc
    /**********************************************************
     */

    /**
     * Pointer to the offset within spill-over area where there is room
     * for more spilled over entries (if any).
     * Spill over area is within fixed-size portion of {@link #_hashArea}.
     */
    private int _spilloverEnd;

    /**
     * Offset within {@link #_hashArea} that follows main slots and contains
     * quads for longer names (13 bytes or longer), and points to the
     * first available int that may be used for appending quads of the next
     * long name.
     * Note that long name area follows immediately after the fixed-size
     * main hash area ({@link #_hashArea}).
     */
    private int _longNameOffset;

    /*
    /**********************************************************
    /* Sharing, versioning
    /**********************************************************
     */

    // // // Which of the buffers may be shared (and are copy-on-write)?

    /**
     * Flag that indicates whether underlying data structures for
     * the main hash area are shared or not. If they are, then they
     * need to be handled in copy-on-write way, i.e. if they need
     * to be modified, a copy needs to be made first; at this point
     * it will not be shared any more, and can be modified.
     *<p>
     * This flag needs to be checked both when adding new main entries,
     * and when adding new collision list queues (i.e.
creating a new
     * collision list head entry)
     */
    private boolean _hashShared;

    /*
    /**********************************************************
    /* Life-cycle: constructors
    /**********************************************************
     */

    /**
     * Constructor used for creating per-<code>JsonFactory</code> "root"
     * symbol tables: ones used for merging and sharing common symbols
     *
     * @param sz Initial primary hash area size
     * @param intern Whether Strings contained should be {@link String#intern}ed
     * @param seed Random seed valued used to make it more difficult to cause
     *   collisions (used for collision-based DoS attacks).
     */
    private ByteQuadsCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) {
        _parent = null;
        _seed = seed;
        _intern = intern;
        _failOnDoS = failOnDoS;
        // Sanity check: let's not allow hash sizes below certain minimum value
        if (sz < MIN_HASH_SIZE) {
            sz = MIN_HASH_SIZE;
        } else {
            // Also; size must be 2^N; otherwise hash algorithm won't
            // work... so let's just pad it up, if so
            if ((sz & (sz - 1)) != 0) { // true if sz is NOT a power of two
                // round up to the next power of two, doubling from the minimum
                int curr = MIN_HASH_SIZE;
                while (curr < sz) {
                    curr += curr;
                }
                sz = curr;
            }
        }
        _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(sz));
    }

    /**
     * Constructor used when creating a child instance
     */
    private ByteQuadsCanonicalizer(ByteQuadsCanonicalizer parent, boolean intern,
            int seed, boolean failOnDoS, TableInfo state)
    {
        _parent = parent;
        _seed = seed;
        _intern = intern;
        _failOnDoS = failOnDoS;
        _tableInfo = null; // not used by child tables

        // Then copy shared state
        _count = state.count;
        _hashSize = state.size;
        _secondaryStart = _hashSize << 2; // right after primary area
        _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
        _tertiaryShift = state.tertiaryShift;

        _hashArea = state.mainHash;
        _names = state.names;

        _spilloverEnd = state.spilloverEnd;
        _longNameOffset = state.longNameOffset;

        // and then set other state to reflect sharing status
        _hashShared = true;
    }

    /*
    /**********************************************************
    /* Life-cycle: factory methods, merging
    /**********************************************************
     */

    /**
     * Factory method to call to create a symbol table instance with a
     * randomized seed value.
     */
    public static ByteQuadsCanonicalizer createRoot() {
        // Need to use a variable seed, to thwart hash-collision based attacks.
        // 14-Feb-2017, tatu: Does this actually help?
        long now = System.currentTimeMillis();
        // ensure it's not 0; and might as well require to be odd so:
        int seed = (((int) now) + ((int) (now >>> 32))) | 1;
        return createRoot(seed);
    }

    // Factory method that should only be called from unit tests, where seed
    // value should remain the same.
    protected static ByteQuadsCanonicalizer createRoot(int seed) {
        return new ByteQuadsCanonicalizer(DEFAULT_T_SIZE, true, seed, true);
    }

    /**
     * Factory method used to create actual symbol table instance to
     * use for parsing.
     */
    public ByteQuadsCanonicalizer makeChild(int flags) {
        return new ByteQuadsCanonicalizer(this,
                JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags),
                _seed,
                JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags),
                _tableInfo.get());
    }

    /**
     * Method called by the using code to indicate it is done with this instance.
     * This lets instance merge accumulated changes into parent (if need be),
     * safely and efficiently, and without calling code having to know about parent
     * information.
     */
    public void release()
    {
        // we will try to merge if child table has new entries
        // 28-Jul-2019, tatu: From [core#548]: do not share if immediate rehash needed
        if ((_parent != null) && maybeDirty()) {
            _parent.mergeChild(new TableInfo(this));
            // Let's also mark this instance as dirty, so that just in
            // case release was too early, there's no corruption of possibly shared data.
            _hashShared = true;
        }
    }

    private void mergeChild(TableInfo childState)
    {
        final int childCount = childState.count;
        TableInfo currState = _tableInfo.get();

        // Should usually grow; but occasionally could also shrink if (but only if)
        // collision list overflow ends up clearing some collision lists.
        // If counts are identical, assume nothing changed and skip the swap.
        if (childCount == currState.count) {
            return;
        }

        // One caveat: let's try to avoid problems with degenerate cases of documents with
        // generated "random" names: for these, symbol tables would bloat indefinitely.
        // One way to do this is to just purge tables if they grow
        // too large, and that's what we'll do here.
        if (childCount > MAX_ENTRIES_FOR_REUSE) {
            // At any rate, need to clean up the tables
            childState = TableInfo.createInitial(DEFAULT_T_SIZE);
        }
        // CAS: if another child raced us and published first, simply lose the race
        _tableInfo.compareAndSet(currState, childState);
    }

    /*
    /**********************************************************
    /* API, accessors
    /**********************************************************
     */

    public int size()
    {
        if (_tableInfo != null) { // root table
            return _tableInfo.get().count;
        }
        // nope, child table
        return _count;
    }

    /**
     * Returns number of primary slots table has currently
     */
    public int bucketCount() { return _hashSize; }

    /**
     * Method called to check to quickly see if a child symbol table
     * may have gotten additional entries. Used for checking to see
     * if a child table should be merged into shared table.
     */
    public boolean maybeDirty() { return !_hashShared; }

    public int hashSeed() { return _seed; }

    /**
     * Method mostly needed by unit tests; calculates number of
     * entries that are in the primary slot set.
These are
     * "perfect" entries, accessible with a single lookup
     */
    public int primaryCount()
    {
        int count = 0;
        // occupancy marker is the length int at slot offset +3 (0 == unused)
        for (int offset = 3, end = _secondaryStart; offset < end; offset += 4) {
            if (_hashArea[offset] != 0) {
                ++count;
            }
        }
        return count;
    }

    /**
     * Method mostly needed by unit tests; calculates number of entries
     * in secondary buckets
     */
    public int secondaryCount() {
        int count = 0;
        int offset = _secondaryStart + 3;
        for (int end = _tertiaryStart; offset < end; offset += 4) {
            if (_hashArea[offset] != 0) {
                ++count;
            }
        }
        return count;
    }

    /**
     * Method mostly needed by unit tests; calculates number of entries
     * in tertiary buckets
     */
    public int tertiaryCount() {
        int count = 0;
        int offset = _tertiaryStart + 3; // to 1.5x, starting point of tertiary
        for (int end = offset + _hashSize; offset < end; offset += 4) {
            if (_hashArea[offset] != 0) {
                ++count;
            }
        }
        return count;
    }

    /**
     * Method mostly needed by unit tests; calculates number of entries
     * in shared spillover area
     */
    public int spilloverCount() {
        // difference between spillover end, start, divided by 4 (four ints per slot)
        return (_spilloverEnd - _spilloverStart()) >> 2;
    }

    public int totalCount()
    {
        // scan whole fixed hash area (primary + secondary + tertiary + spillover)
        int count = 0;
        for (int offset = 3, end = (_hashSize << 3); offset < end; offset += 4) {
            if (_hashArea[offset] != 0) {
                ++count;
            }
        }
        return count;
    }

    @Override
    public String toString() {
        int pri = primaryCount();
        int sec = secondaryCount();
        int tert = tertiaryCount();
        int spill = spilloverCount();
        int total = totalCount();
        return String.format("[%s: size=%d, hashSize=%d, %d/%d/%d/%d pri/sec/ter/spill (=%s), total:%d]",
                getClass().getName(), _count, _hashSize,
                pri, sec, tert, spill, (pri+sec+tert+spill), total);
    }

    /*
    /**********************************************************
    /* Public API, accessing symbols
    /**********************************************************
     */

    public String findName(int q1)
    {
        int offset = _calcOffset(calcHash(q1));
        // first: primary match?
        final int[] hashArea = _hashArea;

        int len = hashArea[offset+3];

        if (len == 1) {
            if (hashArea[offset] == q1) {
                return _names[offset >> 2];
            }
        } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
            return null;
        }
        // secondary? single slot shared by N/2 primaries
        int offset2 = _secondaryStart + ((offset >> 3) << 2);

        len = hashArea[offset2+3];

        if (len == 1) {
            if (hashArea[offset2] == q1) {
                return _names[offset2 >> 2];
            }
        } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
            return null;
        }

        // tertiary lookup & spillovers best to offline
        return _findSecondary(offset, q1);
    }

    public String findName(int q1, int q2)
    {
        int offset = _calcOffset(calcHash(q1, q2));

        final int[] hashArea = _hashArea;

        int len = hashArea[offset+3];

        if (len == 2) {
            if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1])) {
                return _names[offset >> 2];
            }
        } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
            return null;
        }
        // secondary?
        int offset2 = _secondaryStart + ((offset >> 3) << 2);

        len = hashArea[offset2+3];

        if (len == 2) {
            if ((q1 == hashArea[offset2]) && (q2 == hashArea[offset2+1])) {
                return _names[offset2 >> 2];
            }
        } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
            return null;
        }
        return _findSecondary(offset, q1, q2);
    }

    public String findName(int q1, int q2, int q3)
    {
        int offset = _calcOffset(calcHash(q1, q2, q3));
        final int[] hashArea = _hashArea;
        int len = hashArea[offset+3];

        if (len == 3) {
            if ((q1 == hashArea[offset]) && (hashArea[offset+1] == q2) && (hashArea[offset+2] == q3)) {
                return _names[offset >> 2];
            }
        } else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
            return null;
        }
        // secondary?
        int offset2 = _secondaryStart + ((offset >> 3) << 2);

        len = hashArea[offset2+3];

        if (len == 3) {
            if ((q1 == hashArea[offset2]) && (hashArea[offset2+1] == q2) && (hashArea[offset2+2] == q3)) {
                return _names[offset2 >> 2];
            }
        } else if (len == 0) { // empty slot? Short-circuit if no more spillovers
            return null;
        }
        return _findSecondary(offset, q1, q2, q3);
    }

    public String findName(int[] q, int qlen)
    {
        /* This version differs significantly, because longer names do not fit within cell.
         * Rather, they contain hash in main slot, and offset+length to extension area
         * that contains actual quads.
         */
        if (qlen < 4) { // another sanity check
            switch (qlen) {
            case 3:
                return findName(q[0], q[1], q[2]);
            case 2:
                return findName(q[0], q[1]);
            case 1:
                return findName(q[0]);
            default: // if 0 ever passed
                return "";
            }
        }
        final int hash = calcHash(q, qlen);
        int offset = _calcOffset(hash);

        final int[] hashArea = _hashArea;

        final int len = hashArea[offset+3];

        if ((hash == hashArea[offset]) && (len == qlen)) {
            // probable but not guaranteed: verify
            if (_verifyLongName(q, qlen, hashArea[offset+1])) {
                return _names[offset >> 2];
            }
        }
        if (len == 0) { // empty slot; unlikely but avoid further lookups if so
            return null;
        }
        // secondary?
        int offset2 = _secondaryStart + ((offset >> 3) << 2);

        final int len2 = hashArea[offset2+3];
        if ((hash == hashArea[offset2]) && (len2 == qlen)) {
            if (_verifyLongName(q, qlen, hashArea[offset2+1])) {
                return _names[offset2 >> 2];
            }
        }
        return _findSecondary(offset, hash, q, qlen);
    }

    private final int _calcOffset(int hash)
    {
        // NOTE: simple for initial impl, but we may want to interleave it a bit
        // in near future
        // So: first, hash into primary hash index
        int ix = hash & (_hashSize-1);
        // keeping in mind we have 4 ints per entry
        return (ix << 2);
    }

    /*
    /**********************************************************
    /* Access from spill-over areas
    /**********************************************************
     */

    private String _findSecondary(int origOffset, int q1)
    {
        // tertiary area division is dynamic. First; its size is N/4 compared to
        // primary hash size; and offsets are for 4 int slots. So to get to logical
        // index would shift by 4. But! Tertiary area is further split into buckets,
        // determined by shift value.
        // And finally, from bucket back into physical offsets
        int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int[] hashArea = _hashArea;
        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset + bucketSize; offset < end; offset += 4) {
            int len = hashArea[offset+3];
            if ((q1 == hashArea[offset]) && (1 == len)) {
                return _names[offset >> 2];
            }
            if (len == 0) { // empty slot terminates the bucket scan
                return null;
            }
        }
        // but if tertiary full, check out spill-over area as last resort
        // shared spillover starts at 7/8 of the main hash area
        // (which is sized at 2 * _hashSize), so:
        for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
            if ((q1 == hashArea[offset]) && (1 == hashArea[offset+3])) {
                return _names[offset >> 2];
            }
        }
        return null;
    }

    private String _findSecondary(int origOffset, int q1, int q2)
    {
        // same tertiary-bucket-then-spillover scan as single-quad variant
        int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int[] hashArea = _hashArea;

        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset + bucketSize; offset < end; offset += 4) {
            int len = hashArea[offset+3];
            if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == len)) {
                return _names[offset >> 2];
            }
            if (len == 0) {
                return null;
            }
        }
        for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
            if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == hashArea[offset+3])) {
                return _names[offset >> 2];
            }
        }
        return null;
    }

    private String _findSecondary(int origOffset, int q1, int q2, int q3)
    {
        int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int[] hashArea = _hashArea;

        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset + bucketSize; offset < end; offset += 4) {
            int len = hashArea[offset+3];
            if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2]) && (3 == len)) {
                return _names[offset >> 2];
            }
            if (len == 0) {
                return null;
            }
        }
        for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
            if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2])
                    && (3 == hashArea[offset+3])) {
                return _names[offset >> 2];
            }
        }
        return null;
    }

    private String _findSecondary(int origOffset, int hash, int[] q, int qlen)
    {
        // long names: slot stores hash (not quads), so compare hash+length first,
        // then verify actual quads from the long-name extension area
        int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int[] hashArea = _hashArea;

        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset + bucketSize; offset < end; offset += 4) {
            int len = hashArea[offset+3];
            if ((hash == hashArea[offset]) && (qlen == len)) {
                if (_verifyLongName(q, qlen, hashArea[offset+1])) {
                    return _names[offset >> 2];
                }
            }
            if (len == 0) {
                return null;
            }
        }
        for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
            if ((hash == hashArea[offset]) && (qlen == hashArea[offset+3])) {
                if (_verifyLongName(q, qlen, hashArea[offset+1])) {
                    return _names[offset >> 2];
                }
            }
        }
        return null;
    }

    private boolean _verifyLongName(int[] q, int qlen, int spillOffset)
    {
        final int[] hashArea = _hashArea;
        // spillOffset assumed to be physical index right into quad string
        int ix = 0;

        // NOTE: deliberate switch fall-through: each of cases 8..5 consumes one
        // leading quad, then case 4 compares the final four quads.
        switch (qlen) {
        default:
            return _verifyLongName2(q, qlen, spillOffset);
        case 8:
            if (q[ix++] != hashArea[spillOffset++]) return false;
            // fall through
        case 7:
            if (q[ix++] != hashArea[spillOffset++]) return false;
            // fall through
        case 6:
            if (q[ix++] != hashArea[spillOffset++]) return false;
            // fall through
        case 5:
            if (q[ix++] != hashArea[spillOffset++]) return false;
            // fall through
        case 4: // always at least 4
            if (q[ix++] != hashArea[spillOffset++]) return false;
            if (q[ix++] != hashArea[spillOffset++]) return false;
            if (q[ix++] != hashArea[spillOffset++]) return false;
            if (q[ix++] != hashArea[spillOffset++]) return false;
        }
        return true;
    }

    // Loop variant used for names longer than 8 quads (32 bytes)
    private boolean _verifyLongName2(int[] q, int qlen, int spillOffset)
    {
        int ix = 0;
        do {
            if (q[ix++] != _hashArea[spillOffset++]) {
                return false;
            }
        } while (ix < qlen);
        return true;
    }

    /*
    /**********************************************************
    /* API, mutators
    /**********************************************************
     */

    public String addName(String name, int q1) {
        _verifySharing();
        if (_intern) {
            name = InternCache.instance.intern(name);
        }
        int offset = _findOffsetForAdd(calcHash(q1));
        _hashArea[offset] = q1;
        _hashArea[offset+3] = 1;
        _names[offset >> 2] = name;
        ++_count;
        return name;
    }

    public String addName(String name, int q1, int q2) {
        _verifySharing();
        if (_intern) {
            name = InternCache.instance.intern(name);
        }
        // if q2 is 0, name fits in a single quad; hash must then match the
        // one-quad variant used by findName(q1)
        int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2);
        int offset = _findOffsetForAdd(hash);
        _hashArea[offset] = q1;
        _hashArea[offset+1] = q2;
        _hashArea[offset+3] = 2;
        _names[offset >> 2] = name;
        ++_count;
        return name;
    }

    public String addName(String name, int q1, int q2, int q3) {
        _verifySharing();
        if (_intern) {
            name = InternCache.instance.intern(name);
        }
        int offset = _findOffsetForAdd(calcHash(q1, q2, q3));
        _hashArea[offset] = q1;
        _hashArea[offset+1] = q2;
        _hashArea[offset+2] = q3;
        _hashArea[offset+3] = 3;
        _names[offset >> 2] = name;
        ++_count;
        return name;
    }

    public String addName(String name, int[] q, int qlen)
    {
        _verifySharing();
        if (_intern) {
            name = InternCache.instance.intern(name);
        }
        int offset;

        switch (qlen) {
        case 1:
            {
                offset = _findOffsetForAdd(calcHash(q[0]));
                _hashArea[offset] = q[0];
                _hashArea[offset+3] = 1;
            }
            break;
        case 2:
            {
                offset = _findOffsetForAdd(calcHash(q[0], q[1]));
                _hashArea[offset] = q[0];
                _hashArea[offset+1] = q[1];
                _hashArea[offset+3] = 2;
            }
            break;
        case 3:
            {
                offset = _findOffsetForAdd(calcHash(q[0], q[1], q[2]));
                _hashArea[offset] = q[0];
                _hashArea[offset+1] = q[1];
                _hashArea[offset+2] = q[2];
                _hashArea[offset+3] = 3;
            }
            break;
        default:
            // long name: slot stores hash + pointer into long-name quad area
            final int hash = calcHash(q, qlen);
            offset = _findOffsetForAdd(hash);

            _hashArea[offset] = hash;
            int longStart = _appendLongName(q, qlen);
            _hashArea[offset+1] = longStart;
            _hashArea[offset+3] = qlen;
        }
        // plus add the actual String
        _names[offset >> 2] = name;

        // and finally; see if we really should rehash.
        ++_count;
        return name;
    }

    // Copy-on-write: make private copies of shared hash/name arrays before
    // first mutation by this child instance.
    private void _verifySharing()
    {
        if (_hashShared) {
            _hashArea = Arrays.copyOf(_hashArea, _hashArea.length);
            _names = Arrays.copyOf(_names, _names.length);
            _hashShared = false;
        }
    }

    /**
     * Method called to find the location within hash table to add a new symbol in.
     */
    private int _findOffsetForAdd(int hash)
    {
        // first, check the primary: if slot found, no need for resize
        int offset = _calcOffset(hash);
        final int[] hashArea = _hashArea;
        if (hashArea[offset+3] == 0) {
//System.err.printf(" PRImary slot #%d, hash %X\n", (offset>>2), hash & 0x7F);
            return offset;
        }

        // Otherwise let's see if we are due resize():
        if (_checkNeedForRehash()) {
            return _resizeAndFindOffsetForAdd(hash);
        }

        // If not, proceed with secondary slot
        int offset2 = _secondaryStart + ((offset >> 3) << 2);
        if (hashArea[offset2+3] == 0) {
//System.err.printf(" SECondary slot #%d (start x%X), hash %X\n",(offset >> 3), _secondaryStart, (hash & 0x7F));
            return offset2;
        }
        // if not, tertiary?

        offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
            if (hashArea[offset2+3] == 0) {
//System.err.printf(" TERtiary slot x%X (from x%X, start x%X), hash %X.\n", offset2, ((offset >> (_tertiaryShift + 2)) << _tertiaryShift), _tertiaryStart, (hash & 0x7F));
                return offset2;
            }
        }

        // and if even tertiary full, append at the end of spill area
        offset = _spilloverEnd;
        _spilloverEnd += 4;

//System.err.printf(" SPIll-over at x%X; start x%X; end x%X, hash %X\n", offset, _spilloverStart(), _hashArea.length, (hash & 0x7F));

        // one caveat: in the unlikely event if spill-over filling up,
        // check if that could be considered a DoS attack; handle appropriately
        // (NOTE: approximate for now; we could verify details if that becomes necessary)
        /* 31-Jul-2015, tatu: Note that spillover area does NOT end at end of array,
         * since "long names" area follows. Instead, need to calculate from hash size.
         */
        final int end = (_hashSize << 3);
        if (_spilloverEnd >= end) {
            if (_failOnDoS) {
                _reportTooManyCollisions();
            }
            return _resizeAndFindOffsetForAdd(hash);
        }
        return offset;
    }

    // @since 2.10
    private int _resizeAndFindOffsetForAdd(int hash)
    {
        // First things first: we need to resize+rehash (or, if too big, nuke contents)
        rehash();

        // Copy of main _findOffsetForAdd except for checks to resize: can not be needed
        int offset = _calcOffset(hash);
        final int[] hashArea = _hashArea;
        if (hashArea[offset+3] == 0) {
            return offset;
        }
        int offset2 = _secondaryStart + ((offset >> 3) << 2);
        if (hashArea[offset2+3] == 0) {
            return offset2;
        }
        offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
        final int bucketSize = (1 << _tertiaryShift);
        for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
            if (hashArea[offset2+3] == 0) {
                return offset2;
            }
        }
        offset = _spilloverEnd;
        _spilloverEnd += 4;
        return offset;
    }

    // Helper method for checking if we should simply rehash() before add
    private boolean _checkNeedForRehash() {
        // Yes if above 80%, or above 50% AND have ~1% spill-overs
        if (_count > (_hashSize >> 1)) { // over 50%
            int spillCount = (_spilloverEnd - _spilloverStart()) >> 2;
            // NOTE(review): `1 + _count >> 7` parses as `(1 + _count) >> 7` since
            // `+` binds tighter than `>>` — confirm against upstream whether
            // `1 + (_count >> 7)` was intended before changing
            if ((spillCount > (1 + _count >> 7))
                    || (_count > (_hashSize * 0.80))) {
                return true;
            }
        }
        return false;
    }

    private int _appendLongName(int[] quads, int qlen)
    {
        int start = _longNameOffset;

        // note: at this point we must already be shared.
        // But may not have enough space
        if ((start + qlen) > _hashArea.length) {
            // try to increment in reasonable chunks; at least space that we need
            int toAdd = (start + qlen) - _hashArea.length;
            // but at least 1/8 of regular hash area size or 16kB (whichever smaller)
            int minAdd = Math.min(4096, _hashSize);

            int newSize = _hashArea.length + Math.max(toAdd, minAdd);
            _hashArea = Arrays.copyOf(_hashArea, newSize);
        }
        System.arraycopy(quads, 0, _hashArea, start, qlen);
        _longNameOffset += qlen;
        return start;
    }

    /*
    /**********************************************************
    /* Hash calculation
    /**********************************************************
     */

    /* Note on hash calculation: we try to make it more difficult to
     * generate collisions automatically; part of this is to avoid
     * simple "multiply-add" algorithm (like JDK String.hashCode()),
     * and add bit of shifting. And other part is to make this
     * non-linear, at least for shorter symbols.
     */

    // JDK uses 31; other fine choices are 33 and 65599, let's use 33
    // as it seems to give fewest collisions for us
    // (see [http://www.cse.yorku.ca/~oz/hash.html] for details)
    private final static int MULT = 33;
    private final static int MULT2 = 65599;
    private final static int MULT3 = 31;

    /**
     * Calculates hash for a name that fits in a single quad (1-4 bytes).
     *
     * @param q1 First (and only) quad of name bytes
     * @return Hash code to use for lookup/insertion
     */
    public int calcHash(int q1)
    {
        int hash = q1 ^ _seed;
        /* 29-Mar-2015, tatu: Earlier used 15 + 9 right shifts, which worked ok
         * except for one specific problem case: numbers. So needed to make sure
         * that all 4 least-significant bits participate in hash. Couple of ways
         * to work it out, but this is the simplest, fast and seems to do ok.
         */
        hash += (hash >>> 16); // to xor hi- and low- 16-bits
        hash ^= (hash << 3); // shuffle back a bit
        hash += (hash >>> 12); // and bit more
        return hash;
    }

    /**
     * Calculates hash for a name that fits in two quads (5-8 bytes).
     *
     * @param q1 First quad of name bytes
     * @param q2 Second quad of name bytes
     * @return Hash code to use for lookup/insertion
     */
    public int calcHash(int q1, int q2)
    {
        // For two quads, let's change algorithm a bit, to spice
        // things up (can do bit more processing anyway)
        int hash = q1;

        hash += (hash >>> 15); // try mixing first and second byte pairs first
        hash ^= (hash >>> 9); // as well as lowest 2 bytes
        hash += (q2 * MULT); // then add second quad
        hash ^= _seed;
        hash += (hash >>> 16); // and shuffle some more
        hash ^= (hash >>> 4);
        hash += (hash << 3);

        return hash;
    }

    /**
     * Calculates hash for a name that fits in three quads (9-12 bytes).
     *
     * @param q1 First quad of name bytes
     * @param q2 Second quad of name bytes
     * @param q3 Third quad of name bytes
     * @return Hash code to use for lookup/insertion
     */
    public int calcHash(int q1, int q2, int q3)
    { // use same algorithm as multi-byte, tested to work well
        int hash = q1 ^ _seed;
        hash += (hash >>> 9);
        hash *= MULT3;
        hash += q2;
        hash *= MULT;
        hash += (hash >>> 15);
        hash ^= q3;
        // 26-Mar-2015, tatu: As per two-quad case, a short shift seems to help more here
        hash += (hash >>> 4);

        hash += (hash >>> 15);
        hash ^= (hash << 9);

        return hash;
    }

    /**
     * Calculates hash for a "long" name that needs four or more quads
     * (13 bytes or longer).
     *
     * @param q Buffer containing quads of name bytes
     * @param qlen Number of quads used in {@code q}; must be at least 4
     * @return Hash code to use for lookup/insertion
     *
     * @throws IllegalArgumentException if {@code qlen} is less than 4
     */
    public int calcHash(int[] q, int qlen)
    {
        if (qlen < 4) {
            throw new IllegalArgumentException();
        }
        /* And then change handling again for "multi-quad" case; mostly
         * to make calculation of collisions less fun. For example,
         * add seed bit later in the game, and switch plus/xor around,
         * use different shift lengths.
         */
        int hash = q[0] ^ _seed;
        hash += (hash >>> 9);
        hash += q[1];
        hash += (hash >>> 15);
        hash *= MULT;
        hash ^= q[2];
        hash += (hash >>> 4);

        // fold in remaining quads, each lightly pre-mixed to reduce linearity
        for (int i = 3; i < qlen; ++i) {
            int next = q[i];
            next = next ^ (next >> 21);
            hash += next;
        }
        hash *= MULT2;

        // and finally shuffle some more once done
        hash += (hash >>> 19);
        hash ^= (hash << 5);
        return hash;
    }

    /*
    /**********************************************************
    /* Rehashing
    /**********************************************************
     */

    /**
     * Doubles the size of the hash area and re-adds all existing entries;
     * if doubling would exceed maximum table size ({@code MAX_T_SIZE}),
     * simply clears all symbols instead (to guard against OOME).
     */
    private void rehash()
    {
        // Note: since we'll make copies, no need to unshare, can just mark as such:
        _hashShared = false;

        // And then we can first deal with the main hash area. Since we are expanding
        // linearly (double up), we know there'll be no collisions during this phase.
        final int[] oldHashArea = _hashArea;
        final String[] oldNames = _names;
        final int oldSize = _hashSize;
        final int oldCount = _count;
        final int newSize = oldSize + oldSize;
        final int oldEnd = _spilloverEnd;

        /* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by
         * large documents with unique (or mostly so) names
         */
        if (newSize > MAX_T_SIZE) {
            nukeSymbols(true);
            return;
        }
        // double up main hash area, but do not expand long-name area:
        _hashArea = new int[oldHashArea.length + (oldSize<<3)];
        _hashSize = newSize;
        _secondaryStart = (newSize << 2); // 4 ints per entry
        _tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
        _tertiaryShift = _calcTertiaryShift(newSize);

        // and simply double up name array
        _names = new String[oldNames.length << 1];
        nukeSymbols(false);

        // Plus we can scan only through the primary hash area, looking for non-empty
        // slots, without worrying about ordering. This should never reduce priority
        // of existing entries: primaries remain primaries; however, due to increased
        // space, secondaries may become primaries etc

        int copyCount = 0;
        int[] q = new int[16];
        // scan all old slots up to the old spill-over end (covers every occupied slot)
        for (int offset = 0, end = oldEnd; offset < end; offset += 4) {
            int len = oldHashArea[offset+3]; // 4th int of slot is quad length; 0 == empty
            if (len == 0) { // empty slot, skip
                continue;
            }
            ++copyCount;
            String name = oldNames[offset>>2]; // one name String per 4-int slot
            switch (len) {
            case 1:
                q[0] = oldHashArea[offset];
                addName(name, q, 1);
                break;
            case 2:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                addName(name, q, 2);
                break;
            case 3:
                q[0] = oldHashArea[offset];
                q[1] = oldHashArea[offset+1];
                q[2] = oldHashArea[offset+2];
                addName(name, q, 3);
                break;
            default: // long name: slot holds hash and offset into long-name area
                if (len > q.length) {
                    q = new int[len];
                }
                // #0 is hash, #1 offset
                int qoff = oldHashArea[offset+1];
                System.arraycopy(oldHashArea, qoff, q, 0, len);
                addName(name, q, len);
                break;
            }
        }

        // Sanity checks: since corruption difficult to detect, assert explicitly
        // with production code
        if (copyCount != oldCount) {
            throw new IllegalStateException("Failed rehash(): old count="+oldCount+", copyCount="+copyCount);
        }
    }

    /**
     * Helper method called to empty all shared symbols, but to leave
     * arrays allocated
     *
     * @param fill Whether to also zero out the hash area and null the name
     *   array (needed when reusing arrays; not when they were just reallocated)
     */
    private void nukeSymbols(boolean fill) {
        _count = 0;
        // reset spill-over to empty (starting at 7/8 of hash area)
        _spilloverEnd = _spilloverStart();
        // and long name area to empty, starting immediately after hash area
        _longNameOffset = _hashSize << 3;
        if (fill) {
            Arrays.fill(_hashArea, 0);
            Arrays.fill(_names, null);
        }
    }

    /*
    /**********************************************************
    /* Helper methods
    /**********************************************************
     */

    /**
     * Helper method that calculates start of the spillover area
     * (as an int index into {@code _hashArea}): at 7/8 of the total
     * hash area, right after the tertiary area.
     */
    private final int _spilloverStart() {
        // we'll need slot at 1.75x of hashSize, but with 4-ints per slot.
        // So basically multiply by 7
        int offset = _hashSize;
        return (offset << 3) - offset;
    }

    /**
     * Called when the spill-over area fills up and {@code _failOnDoS} is
     * enabled: throws {@link IllegalStateException} for tables large enough
     * that filling up suggests deliberate hash collisions (possible DoS
     * attack); silently returns for small tables, which may get balanced
     * out by the subsequent resize.
     */
    protected void _reportTooManyCollisions()
    {
        // First: do not fuzz about small symbol tables; may get balanced by doubling up
        if (_hashSize <= 1024) { // would have spill-over area of 128 entries
            return;
        }
        // NOTE(review): spill-over area geometry (_spilloverStart() at 7*_hashSize
        // ints, hash area ending at 8*_hashSize ints, 4 ints per slot) gives
        // (_hashSize >> 2) slots, but the message reports (_hashSize >> 3), and
        // the "(all ..." parenthesis is never closed -- confirm intended wording
        throw new IllegalStateException("Spill-over slots in symbol table with "+_count
                +" entries, hash area of "+_hashSize+" slots is now full (all "
                +(_hashSize >> 3)+" slots -- suspect a DoS attack based on hash collisions."
                +" You can disable the check via `JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW`");
    }

    /**
     * Calculates the shift to use for tertiary-area buckets for given
     * number of primary slots: bigger tables get bigger buckets, from
     * 4 slots (shift of 4, over 4-int slots) up to 32 slots (shift of 7).
     *
     * @param primarySlots Number of primary hash slots
     * @return Shift to apply to get size (in ints) of one tertiary bucket
     */
    static int _calcTertiaryShift(int primarySlots)
    {
        // first: we only get 1/4 of slots of primary, to divide
        int tertSlots = (primarySlots) >> 2;

        // default is for buckets of 4 slots (each 4 ints, i.e. 1 << 4)
        if (tertSlots < 64) {
            return 4;
        }
        if (tertSlots <= 256) { // buckets of 8 slots (up to 256 == 32 x 8)
            return 5;
        }
        if (tertSlots <= 1024) { // buckets of 16 slots (up to 1024 == 64 x 16)
            return 6;
        }
        // and biggest buckets have 32 slots
        return 7;
    }

    /*
    /**********************************************************
    /* Helper classes
    /**********************************************************
     */

    /**
     * Immutable value class used for sharing information as efficiently
     * as possible, by only requiring synchronization of reference manipulation
     * but not access to contents.
     *
     * @since 2.1
     */
    private final static class TableInfo
    {
        public final int size;
        public final int count;
        public final int tertiaryShift;
        public final int[] mainHash;
        public final String[] names;
        public final int spilloverEnd;
        public final int longNameOffset;

        public TableInfo(int size, int count, int tertiaryShift,
                int[] mainHash, String[] names, int spilloverEnd, int longNameOffset)
        {
            this.size = size;
            this.count = count;
            this.tertiaryShift = tertiaryShift;
            this.mainHash = mainHash;
            this.names = names;
            this.spilloverEnd = spilloverEnd;
            this.longNameOffset = longNameOffset;
        }

        // Snapshot constructor: captures current state of given table instance
        public TableInfo(ByteQuadsCanonicalizer src)
        {
            size = src._hashSize;
            count = src._count;
            tertiaryShift = src._tertiaryShift;
            mainHash = src._hashArea;
            names = src._names;
            spilloverEnd = src._spilloverEnd;
            longNameOffset = src._longNameOffset;
        }

        // Factory for the initial (empty) state for a table of given primary size
        public static TableInfo createInitial(int sz) {
            int hashAreaSize = sz << 3;
            int tertShift = _calcTertiaryShift(sz);

            return new TableInfo(sz, // hashSize
                    0, // count
                    tertShift,
                    new int[hashAreaSize], // mainHash, 2x slots, 4 ints per slot
                    new String[sz << 1], // names == 2x slots
                    hashAreaSize - sz, // at 7/8 of the total area
                    hashAreaSize // longNameOffset, immediately after main hashes
            );
        }
    }
}