1 package com.fasterxml.jackson.core.sym; 2 3 import java.util.Arrays; 4 import java.util.BitSet; 5 import java.util.concurrent.atomic.AtomicReference; 6 7 import com.fasterxml.jackson.core.JsonFactory; 8 import com.fasterxml.jackson.core.util.InternCache; 9 10 /** 11 * This class is a kind of specialized type-safe Map, from char array to 12 * String value. Specialization means that in addition to type-safety 13 * and specific access patterns (key char array, Value optionally interned 14 * String; values added on access if necessary), and that instances are 15 * meant to be used concurrently, but by using well-defined mechanisms 16 * to obtain such concurrently usable instances. Main use for the class 17 * is to store symbol table information for things like compilers and 18 * parsers; especially when number of symbols (keywords) is limited. 19 *<p> 20 * For optimal performance, usage pattern should be one where matches 21 * should be very common (especially after "warm-up"), and as with most hash-based 22 * maps/sets, that hash codes are uniformly distributed. Also, collisions 23 * are slightly more expensive than with HashMap or HashSet, since hash codes 24 * are not used in resolving collisions; that is, equals() comparison is 25 * done with all symbols in same bucket index.<br> 26 * Finally, rehashing is also more expensive, as hash codes are not 27 * stored; rehashing requires all entries' hash codes to be recalculated. 28 * Reason for not storing hash codes is reduced memory usage, hoping 29 * for better memory locality. 30 *<p> 31 * Usual usage pattern is to create a single "master" instance, and either 32 * use that instance in sequential fashion, or to create derived "child" 33 * instances, which after use, are asked to return possible symbol additions 34 * to master instance. In either case benefit is that symbol table gets 35 * initialized so that further uses are more efficient, as eventually all 36 * symbols needed will already be in symbol table. At that point no more 37 * Symbol String allocations are needed, nor changes to symbol table itself. 38 *<p> 39 * Note that while individual SymbolTable instances are NOT thread-safe 40 * (much like generic collection classes), concurrently used "child" 41 * instances can be freely used without synchronization. However, using 42 * master table concurrently with child instances can only be done if 43 * access to master instance is read-only (i.e. no modifications done). 44 */ 45 public final class CharsToNameCanonicalizer 46 { 47 /* If we use "multiply-add" based hash algorithm, this is the multiplier 48 * we use. 49 *<p> 50 * Note that JDK uses 31; but it seems that 33 produces fewer collisions, 51 * at least with tests we have. 52 */ 53 public final static int HASH_MULT = 33; 54 55 /** 56 * Default initial table size. Shouldn't be miniscule (as there's 57 * cost to both array realloc and rehashing), but let's keep 58 * it reasonably small. For systems that properly 59 * reuse factories it doesn't matter either way; but when 60 * recreating factories often, initial overhead may dominate. 61 */ 62 private static final int DEFAULT_T_SIZE = 64; 63 64 /** 65 * Let's not expand symbol tables past some maximum size; 66 * this should protected against OOMEs caused by large documents 67 * with unique (~= random) names. 68 */ 69 private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem 70 71 /** 72 * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k; 73 * this corresponds to 64k main hash index. This should allow for enough distinct 74 * names for almost any case. 75 */ 76 static final int MAX_ENTRIES_FOR_REUSE = 12000; 77 78 /** 79 * Also: to thwart attacks based on hash collisions (which may or may not 80 * be cheap to calculate), we will need to detect "too long" 81 * collision chains. Let's start with static value of 100 entries 82 * for the longest legal chain. 83 *<p> 84 * Note: longest chain we have been able to produce without malicious 85 * intent has been 38 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols"); 86 * our setting should be reasonable here. 87 * 88 * @since 2.1 89 */ 90 static final int MAX_COLL_CHAIN_LENGTH = 100; 91 92 /* 93 /********************************************************** 94 /* Configuration 95 /********************************************************** 96 */ 97 98 /** 99 * Sharing of learnt symbols is done by optional linking of symbol 100 * table instances with their parents. When parent linkage is 101 * defined, and child instance is released (call to <code>release</code>), 102 * parent's shared tables may be updated from the child instance. 103 */ 104 final private CharsToNameCanonicalizer _parent; 105 106 /** 107 * Member that is only used by the root table instance: root 108 * passes immutable state info child instances, and children 109 * may return new state if they add entries to the table. 110 * Child tables do NOT use the reference. 111 */ 112 final private AtomicReference<TableInfo> _tableInfo; 113 114 /** 115 * Seed value we use as the base to make hash codes non-static between 116 * different runs, but still stable for lifetime of a single symbol table 117 * instance. 118 * This is done for security reasons, to avoid potential DoS attack via 119 * hash collisions. 120 * 121 * @since 2.1 122 */ 123 final private int _seed; 124 125 final private int _flags; 126 127 /** 128 * Whether any canonicalization should be attempted (whether using 129 * intern or not. 130 *<p> 131 * NOTE: non-final since we may need to disable this with overflow. 132 */ 133 private boolean _canonicalize; 134 135 /* 136 /********************************************************** 137 /* Actual symbol table data 138 /********************************************************** 139 */ 140 141 /** 142 * Primary matching symbols; it's expected most match occur from 143 * here. 144 */ 145 private String[] _symbols; 146 147 /** 148 * Overflow buckets; if primary doesn't match, lookup is done 149 * from here. 150 *<p> 151 * Note: Number of buckets is half of number of symbol entries, on 152 * assumption there's less need for buckets. 153 */ 154 private Bucket[] _buckets; 155 156 /** 157 * Current size (number of entries); needed to know if and when 158 * rehash. 159 */ 160 private int _size; 161 162 /** 163 * Limit that indicates maximum size this instance can hold before 164 * it needs to be expanded and rehashed. Calculated using fill 165 * factor passed in to constructor. 166 */ 167 private int _sizeThreshold; 168 169 /** 170 * Mask used to get index from hash values; equal to 171 * <code>_buckets.length - 1</code>, when _buckets.length is 172 * a power of two. 173 */ 174 private int _indexMask; 175 176 /** 177 * We need to keep track of the longest collision list; this is needed 178 * both to indicate problems with attacks and to allow flushing for 179 * other cases. 180 * 181 * @since 2.1 182 */ 183 private int _longestCollisionList; 184 185 /* 186 /********************************************************** 187 /* State regarding shared arrays 188 /********************************************************** 189 */ 190 191 /** 192 * Flag that indicates whether underlying data structures for 193 * the main hash area are shared or not. If they are, then they 194 * need to be handled in copy-on-write way, i.e. if they need 195 * to be modified, a copy needs to be made first; at this point 196 * it will not be shared any more, and can be modified. 197 *<p> 198 * This flag needs to be checked both when adding new main entries, 199 * and when adding new collision list queues (i.e. creating a new 200 * collision list head entry) 201 */ 202 private boolean _hashShared; 203 204 /* 205 /********************************************************** 206 /* Bit of DoS detection goodness 207 /********************************************************** 208 */ 209 210 /** 211 * Lazily constructed structure that is used to keep track of 212 * collision buckets that have overflowed once: this is used 213 * to detect likely attempts at denial-of-service attacks that 214 * uses hash collisions. 215 * 216 * @since 2.4 217 */ 218 private BitSet _overflows; 219 220 /* 221 /********************************************************** 222 /* Life-cycle: constructors 223 /********************************************************** 224 */ 225 226 /** 227 * Main method for constructing a root symbol table instance. 228 */ CharsToNameCanonicalizer(int seed)229 private CharsToNameCanonicalizer(int seed) 230 { 231 _parent = null; 232 _seed = seed; 233 234 // these settings don't really matter for the bootstrap instance 235 _canonicalize = true; 236 _flags = -1; 237 // And we'll also set flags so no copying of buckets is needed: 238 _hashShared = false; // doesn't really matter for root instance 239 _longestCollisionList = 0; 240 241 _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(DEFAULT_T_SIZE)); 242 // and actually do NOT assign buffers so we'll find if anyone tried to 243 // use root instance 244 } 245 246 /** 247 * Internal constructor used when creating child instances. 248 */ CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed, TableInfo parentState)249 private CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed, 250 TableInfo parentState) 251 { 252 _parent = parent; 253 _seed = seed; 254 _tableInfo = null; // not used by child tables 255 _flags = flags; 256 _canonicalize = JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(flags); 257 258 // Then copy shared state 259 _symbols = parentState.symbols; 260 _buckets = parentState.buckets; 261 262 _size = parentState.size; 263 _longestCollisionList = parentState.longestCollisionList; 264 265 // Hard-coded fill factor, 75% 266 int arrayLen = (_symbols.length); 267 _sizeThreshold = _thresholdSize(arrayLen); 268 _indexMask = (arrayLen - 1); 269 270 // Need to make copies of arrays, if/when adding new entries 271 _hashShared = true; 272 } 273 _thresholdSize(int hashAreaSize)274 private static int _thresholdSize(int hashAreaSize) { return hashAreaSize - (hashAreaSize >> 2); } 275 276 /* 277 /********************************************************** 278 /* Life-cycle: factory methods, merging 279 /********************************************************** 280 */ 281 282 /** 283 * Method called to create root canonicalizer for a {@link com.fasterxml.jackson.core.JsonFactory} 284 * instance. Root instance is never used directly; its main use is for 285 * storing and sharing underlying symbol arrays as needed. 286 */ createRoot()287 public static CharsToNameCanonicalizer createRoot() { 288 // Need to use a variable seed, to thwart hash-collision based attacks. 289 // 14-Feb-2017, tatu: not sure it actually helps, at all, since it won't 290 // change mixing or any of the steps. Should likely just remove in future. 291 long now = System.currentTimeMillis(); 292 // ensure it's not 0; and might as well require to be odd so: 293 int seed = (((int) now) + ((int) (now >>> 32))) | 1; 294 return createRoot(seed); 295 } 296 createRoot(int seed)297 protected static CharsToNameCanonicalizer createRoot(int seed) { 298 return new CharsToNameCanonicalizer(seed); 299 } 300 /** 301 * "Factory" method; will create a new child instance of this symbol 302 * table. It will be a copy-on-write instance, ie. it will only use 303 * read-only copy of parent's data, but when changes are needed, a 304 * copy will be created. 305 *<p> 306 * Note: while this method is synchronized, it is generally not 307 * safe to both use makeChild/mergeChild, AND to use instance 308 * actively. Instead, a separate 'root' instance should be used 309 * on which only makeChild/mergeChild are called, but instance itself 310 * is not used as a symbol table. 311 */ makeChild(int flags)312 public CharsToNameCanonicalizer makeChild(int flags) { 313 return new CharsToNameCanonicalizer(this, flags, _seed, _tableInfo.get()); 314 } 315 316 /** 317 * Method called by the using code to indicate it is done with this instance. 318 * This lets instance merge accumulated changes into parent (if need be), 319 * safely and efficiently, and without calling code having to know about parent 320 * information. 321 */ release()322 public void release() { 323 // If nothing has been added, nothing to do 324 if (!maybeDirty()) { return; } 325 326 // we will try to merge if child table has new entries 327 if (_parent != null && _canonicalize) { // canonicalize set to false if max size was reached 328 _parent.mergeChild(new TableInfo(this)); 329 // Let's also mark this instance as dirty, so that just in 330 // case release was too early, there's no corruption of possibly shared data. 331 _hashShared = true; 332 } 333 } 334 335 /** 336 * Method that allows contents of child table to potentially be 337 * "merged in" with contents of this symbol table. 338 *<p> 339 * Note that caller has to make sure symbol table passed in is 340 * really a child or sibling of this symbol table. 341 */ mergeChild(TableInfo childState)342 private void mergeChild(TableInfo childState) 343 { 344 final int childCount = childState.size; 345 TableInfo currState = _tableInfo.get(); 346 347 // Should usually grow; but occasionally could also shrink if (but only if) 348 // collision list overflow ends up clearing some collision lists. 349 if (childCount == currState.size) { 350 return; 351 } 352 // One caveat: let's try to avoid problems with degenerate cases of documents with 353 // generated "random" names: for these, symbol tables would bloat indefinitely. 354 // One way to do this is to just purge tables if they grow 355 // too large, and that's what we'll do here. 356 if (childCount > MAX_ENTRIES_FOR_REUSE) { 357 // At any rate, need to clean up the tables 358 childState = TableInfo.createInitial(DEFAULT_T_SIZE); 359 } 360 _tableInfo.compareAndSet(currState, childState); 361 } 362 363 /* 364 /********************************************************** 365 /* Public API, generic accessors: 366 /********************************************************** 367 */ 368 size()369 public int size() { 370 if (_tableInfo != null) { // root table 371 return _tableInfo.get().size; 372 } 373 // nope, child table 374 return _size; 375 } 376 377 /** 378 * Method for checking number of primary hash buckets this symbol 379 * table uses. 380 * 381 * @since 2.1 382 */ bucketCount()383 public int bucketCount() { return _symbols.length; } maybeDirty()384 public boolean maybeDirty() { return !_hashShared; } hashSeed()385 public int hashSeed() { return _seed; } 386 387 /** 388 * Method mostly needed by unit tests; calculates number of 389 * entries that are in collision list. Value can be at most 390 * ({@link #size} - 1), but should usually be much lower, ideally 0. 391 * 392 * @since 2.1 393 */ collisionCount()394 public int collisionCount() { 395 int count = 0; 396 397 for (Bucket bucket : _buckets) { 398 if (bucket != null) { 399 count += bucket.length; 400 } 401 } 402 return count; 403 } 404 405 /** 406 * Method mostly needed by unit tests; calculates length of the 407 * longest collision chain. This should typically be a low number, 408 * but may be up to {@link #size} - 1 in the pathological case 409 * 410 * @since 2.1 411 */ maxCollisionLength()412 public int maxCollisionLength() { return _longestCollisionList; } 413 414 /* 415 /********************************************************** 416 /* Public API, accessing symbols: 417 /********************************************************** 418 */ 419 findSymbol(char[] buffer, int start, int len, int h)420 public String findSymbol(char[] buffer, int start, int len, int h) 421 { 422 if (len < 1) { // empty Strings are simplest to handle up front 423 return ""; 424 } 425 if (!_canonicalize) { // [JACKSON-259] 426 return new String(buffer, start, len); 427 } 428 429 /* Related to problems with sub-standard hashing (somewhat 430 * relevant for collision attacks too), let's try little 431 * bit of shuffling to improve hash codes. 432 * (note, however, that this can't help with full collisions) 433 */ 434 int index = _hashToIndex(h); 435 String sym = _symbols[index]; 436 437 // Optimal case; checking existing primary symbol for hash index: 438 if (sym != null) { 439 // Let's inline primary String equality checking: 440 if (sym.length() == len) { 441 int i = 0; 442 while (sym.charAt(i) == buffer[start+i]) { 443 // Optimal case; primary match found 444 if (++i == len) { 445 return sym; 446 } 447 } 448 } 449 Bucket b = _buckets[index>>1]; 450 if (b != null) { 451 sym = b.has(buffer, start, len); 452 if (sym != null) { 453 return sym; 454 } 455 sym = _findSymbol2(buffer, start, len, b.next); 456 if (sym != null) { 457 return sym; 458 } 459 } 460 } 461 return _addSymbol(buffer, start, len, h, index); 462 } 463 _findSymbol2(char[] buffer, int start, int len, Bucket b)464 private String _findSymbol2(char[] buffer, int start, int len, Bucket b) { 465 while (b != null) { 466 String sym = b.has(buffer, start, len); 467 if (sym != null) { 468 return sym; 469 } 470 b = b.next; 471 } 472 return null; 473 } 474 _addSymbol(char[] buffer, int start, int len, int h, int index)475 private String _addSymbol(char[] buffer, int start, int len, int h, int index) 476 { 477 if (_hashShared) { //need to do copy-on-write? 478 copyArrays(); 479 _hashShared = false; 480 } else if (_size >= _sizeThreshold) { // Need to expand? 481 rehash(); 482 // Need to recalc hash; rare occurrence (index mask has been 483 // recalculated as part of rehash) 484 index = _hashToIndex(calcHash(buffer, start, len)); 485 } 486 487 String newSymbol = new String(buffer, start, len); 488 if (JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(_flags)) { 489 newSymbol = InternCache.instance.intern(newSymbol); 490 } 491 ++_size; 492 // Ok; do we need to add primary entry, or a bucket? 493 if (_symbols[index] == null) { 494 _symbols[index] = newSymbol; 495 } else { 496 final int bix = (index >> 1); 497 Bucket newB = new Bucket(newSymbol, _buckets[bix]); 498 int collLen = newB.length; 499 if (collLen > MAX_COLL_CHAIN_LENGTH) { 500 // 23-May-2014, tatu: Instead of throwing an exception right away, 501 // let's handle in bit smarter way. 502 _handleSpillOverflow(bix, newB, index); 503 } else { 504 _buckets[bix] = newB; 505 _longestCollisionList = Math.max(collLen, _longestCollisionList); 506 } 507 } 508 return newSymbol; 509 } 510 511 /** 512 * Method called when an overflow bucket has hit the maximum expected length: 513 * this may be a case of DoS attack. Deal with it based on settings by either 514 * clearing up bucket (to avoid indefinite expansion) or throwing exception. 515 * Currently the first overflow for any single bucket DOES NOT throw an exception, 516 * only second time (per symbol table instance) 517 */ _handleSpillOverflow(int bucketIndex, Bucket newBucket, int mainIndex)518 private void _handleSpillOverflow(int bucketIndex, Bucket newBucket, int mainIndex) 519 { 520 if (_overflows == null) { 521 _overflows = new BitSet(); 522 _overflows.set(bucketIndex); 523 } else { 524 if (_overflows.get(bucketIndex)) { 525 // Has happened once already for this bucket index, so probably not coincidental... 526 if (JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(_flags)) { 527 reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH); 528 } 529 // but even if we don't fail, we will stop canonicalizing as safety measure 530 // (so as not to cause problems with PermGen) 531 _canonicalize = false; 532 } else { 533 _overflows.set(bucketIndex); 534 } 535 } 536 537 // regardless, if we get this far, clear up the bucket, adjust size appropriately. 538 _symbols[mainIndex] = newBucket.symbol; 539 _buckets[bucketIndex] = null; 540 // newBucket contains new symbol; but we will 541 _size -= (newBucket.length); 542 // we could calculate longest; but for now just mark as invalid 543 _longestCollisionList = -1; 544 } 545 546 /** 547 * Helper method that takes in a "raw" hash value, shuffles it as necessary, 548 * and truncates to be used as the index. 549 */ _hashToIndex(int rawHash)550 public int _hashToIndex(int rawHash) { 551 // doing these seems to help a bit 552 rawHash += (rawHash >>> 15); 553 rawHash ^= (rawHash << 7); 554 rawHash += (rawHash >>> 3); 555 return (rawHash & _indexMask); 556 } 557 558 /** 559 * Implementation of a hashing method for variable length 560 * Strings. Most of the time intention is that this calculation 561 * is done by caller during parsing, not here; however, sometimes 562 * it needs to be done for parsed "String" too. 563 * 564 * @param len Length of String; has to be at least 1 (caller guarantees 565 * this pre-condition) 566 */ calcHash(char[] buffer, int start, int len)567 public int calcHash(char[] buffer, int start, int len) { 568 int hash = _seed; 569 for (int i = start, end = start+len; i < end; ++i) { 570 hash = (hash * HASH_MULT) + (int) buffer[i]; 571 } 572 // NOTE: shuffling, if any, is done in 'findSymbol()', not here: 573 return (hash == 0) ? 1 : hash; 574 } 575 calcHash(String key)576 public int calcHash(String key) 577 { 578 final int len = key.length(); 579 580 int hash = _seed; 581 for (int i = 0; i < len; ++i) { 582 hash = (hash * HASH_MULT) + (int) key.charAt(i); 583 } 584 // NOTE: shuffling, if any, is done in 'findSymbol()', not here: 585 return (hash == 0) ? 1 : hash; 586 } 587 588 /* 589 /********************************************************** 590 /* Internal methods 591 /********************************************************** 592 */ 593 594 /** 595 * Method called when copy-on-write is needed; generally when first 596 * change is made to a derived symbol table. 597 */ copyArrays()598 private void copyArrays() { 599 final String[] oldSyms = _symbols; 600 _symbols = Arrays.copyOf(oldSyms, oldSyms.length); 601 final Bucket[] oldBuckets = _buckets; 602 _buckets = Arrays.copyOf(oldBuckets, oldBuckets.length); 603 } 604 605 /** 606 * Method called when size (number of entries) of symbol table grows 607 * so big that load factor is exceeded. Since size has to remain 608 * power of two, arrays will then always be doubled. Main work 609 * is really redistributing old entries into new String/Bucket 610 * entries. 611 */ rehash()612 private void rehash() { 613 final int size = _symbols.length; 614 int newSize = size + size; 615 616 /* 12-Mar-2010, tatu: Let's actually limit maximum size we are 617 * prepared to use, to guard against OOME in case of unbounded 618 * name sets (unique [non-repeating] names) 619 */ 620 if (newSize > MAX_T_SIZE) { 621 // If this happens, there's no point in either growing or shrinking hash areas. 622 // Rather, let's just cut our losses and stop canonicalizing. 623 _size = 0; 624 _canonicalize = false; 625 // in theory, could just leave these as null, but... 626 _symbols = new String[DEFAULT_T_SIZE]; 627 _buckets = new Bucket[DEFAULT_T_SIZE>>1]; 628 _indexMask = DEFAULT_T_SIZE-1; 629 _hashShared = false; 630 return; 631 } 632 633 final String[] oldSyms = _symbols; 634 final Bucket[] oldBuckets = _buckets; 635 _symbols = new String[newSize]; 636 _buckets = new Bucket[newSize >> 1]; 637 // Let's update index mask, threshold, now (needed for rehashing) 638 _indexMask = newSize - 1; 639 _sizeThreshold = _thresholdSize(newSize); 640 641 int count = 0; // let's do sanity check 642 643 // Need to do two loops, unfortunately, since spill-over area is 644 // only half the size: 645 int maxColl = 0; 646 for (int i = 0; i < size; ++i) { 647 String symbol = oldSyms[i]; 648 if (symbol != null) { 649 ++count; 650 int index = _hashToIndex(calcHash(symbol)); 651 if (_symbols[index] == null) { 652 _symbols[index] = symbol; 653 } else { 654 int bix = (index >> 1); 655 Bucket newB = new Bucket(symbol, _buckets[bix]); 656 _buckets[bix] = newB; 657 maxColl = Math.max(maxColl, newB.length); 658 } 659 } 660 } 661 662 final int bucketSize = (size >> 1); 663 for (int i = 0; i < bucketSize; ++i) { 664 Bucket b = oldBuckets[i]; 665 while (b != null) { 666 ++count; 667 String symbol = b.symbol; 668 int index = _hashToIndex(calcHash(symbol)); 669 if (_symbols[index] == null) { 670 _symbols[index] = symbol; 671 } else { 672 int bix = (index >> 1); 673 Bucket newB = new Bucket(symbol, _buckets[bix]); 674 _buckets[bix] = newB; 675 maxColl = Math.max(maxColl, newB.length); 676 } 677 b = b.next; 678 } 679 } 680 _longestCollisionList = maxColl; 681 _overflows = null; 682 683 if (count != _size) { 684 throw new IllegalStateException(String.format( 685 "Internal error on SymbolTable.rehash(): had %d entries; now have %d", 686 _size, count)); 687 } 688 } 689 690 /** 691 * @since 2.1 692 */ reportTooManyCollisions(int maxLen)693 protected void reportTooManyCollisions(int maxLen) { 694 throw new IllegalStateException("Longest collision chain in symbol table (of size "+_size 695 +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions"); 696 } 697 698 // since 2.10, for tests only 699 /** 700 * Diagnostics method that will verify that internal data structures are consistent; 701 * not meant as user-facing method but only for test suites and possible troubleshooting. 702 * 703 * @since 2.10 704 */ verifyInternalConsistency()705 protected void verifyInternalConsistency() { 706 int count = 0; 707 final int size = _symbols.length; 708 709 for (int i = 0; i < size; ++i) { 710 String symbol = _symbols[i]; 711 if (symbol != null) { 712 ++count; 713 } 714 } 715 716 final int bucketSize = (size >> 1); 717 for (int i = 0; i < bucketSize; ++i) { 718 for (Bucket b = _buckets[i]; b != null; b = b.next) { 719 ++count; 720 } 721 } 722 if (count != _size) { 723 throw new IllegalStateException(String.format("Internal error: expected internal size %d vs calculated count %d", 724 _size, count)); 725 } 726 } 727 728 // For debugging, comment out 729 /* 730 @Override 731 public String toString() 732 { 733 StringBuilder sb = new StringBuilder(); 734 int primaryCount = 0; 735 for (String s : _symbols) { 736 if (s != null) ++primaryCount; 737 } 738 739 sb.append("[BytesToNameCanonicalizer, size: "); 740 sb.append(_size); 741 sb.append('/'); 742 sb.append(_symbols.length); 743 sb.append(", "); 744 sb.append(primaryCount); 745 sb.append('/'); 746 sb.append(_size - primaryCount); 747 sb.append(" coll; avg length: "); 748 749 // Average length: minimum of 1 for all (1 == primary hit); 750 // and then 1 per each traversal for collisions/buckets 751 //int maxDist = 1; 752 int pathCount = _size; 753 for (Bucket b : _buckets) { 754 if (b != null) { 755 int spillLen = b.length; 756 for (int j = 1; j <= spillLen; ++j) { 757 pathCount += j; 758 } 759 } 760 } 761 double avgLength; 762 763 if (_size == 0) { 764 avgLength = 0.0; 765 } else { 766 avgLength = (double) pathCount / (double) _size; 767 } 768 // let's round up a bit (two 2 decimal places) 769 //avgLength -= (avgLength % 0.01); 770 771 sb.append(avgLength); 772 sb.append(']'); 773 return sb.toString(); 774 } 775 */ 776 777 /* 778 /********************************************************** 779 /* Helper classes 780 /********************************************************** 781 */ 782 783 /** 784 * This class is a symbol table entry. Each entry acts as a node 785 * in a linked list. 786 */ 787 static final class Bucket 788 { 789 public final String symbol; 790 public final Bucket next; 791 public final int length; 792 Bucket(String s, Bucket n)793 public Bucket(String s, Bucket n) { 794 symbol = s; 795 next = n; 796 length = (n == null) ? 1 : n.length+1; 797 } 798 has(char[] buf, int start, int len)799 public String has(char[] buf, int start, int len) { 800 if (symbol.length() != len) { 801 return null; 802 } 803 int i = 0; 804 do { 805 if (symbol.charAt(i) != buf[start+i]) { 806 return null; 807 } 808 } while (++i < len); 809 return symbol; 810 } 811 } 812 813 /** 814 * Immutable value class used for sharing information as efficiently 815 * as possible, by only require synchronization of reference manipulation 816 * but not access to contents. 817 * 818 * @since 2.8.7 819 */ 820 private final static class TableInfo 821 { 822 final int size; 823 final int longestCollisionList; 824 final String[] symbols; 825 final Bucket[] buckets; 826 TableInfo(int size, int longestCollisionList, String[] symbols, Bucket[] buckets)827 public TableInfo(int size, int longestCollisionList, 828 String[] symbols, Bucket[] buckets) 829 { 830 this.size = size; 831 this.longestCollisionList = longestCollisionList; 832 this.symbols = symbols; 833 this.buckets = buckets; 834 } 835 TableInfo(CharsToNameCanonicalizer src)836 public TableInfo(CharsToNameCanonicalizer src) 837 { 838 this.size = src._size; 839 this.longestCollisionList = src._longestCollisionList; 840 this.symbols = src._symbols; 841 this.buckets = src._buckets; 842 } 843 createInitial(int sz)844 public static TableInfo createInitial(int sz) { 845 return new TableInfo(0, 846 0, // longestCollisionList 847 new String[sz], new Bucket[sz >> 1]); 848 } 849 } 850 } 851