• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package com.fasterxml.jackson.core.sym;
2 
3 import java.util.Arrays;
4 import java.util.BitSet;
5 import java.util.concurrent.atomic.AtomicReference;
6 
7 import com.fasterxml.jackson.core.JsonFactory;
8 import com.fasterxml.jackson.core.util.InternCache;
9 
10 /**
11  * This class is a kind of specialized type-safe Map, from char array to
12  * String value. Specialization means that in addition to type-safety
13  * and specific access patterns (key char array, Value optionally interned
14  * String; values added on access if necessary), and that instances are
15  * meant to be used concurrently, but by using well-defined mechanisms
16  * to obtain such concurrently usable instances. Main use for the class
17  * is to store symbol table information for things like compilers and
18  * parsers; especially when number of symbols (keywords) is limited.
19  *<p>
20  * For optimal performance, usage pattern should be one where matches
21  * should be very common (especially after "warm-up"), and as with most hash-based
22  * maps/sets, that hash codes are uniformly distributed. Also, collisions
23  * are slightly more expensive than with HashMap or HashSet, since hash codes
24  * are not used in resolving collisions; that is, equals() comparison is
25  * done with all symbols in same bucket index.<br>
26  * Finally, rehashing is also more expensive, as hash codes are not
27  * stored; rehashing requires all entries' hash codes to be recalculated.
28  * Reason for not storing hash codes is reduced memory usage, hoping
29  * for better memory locality.
30  *<p>
31  * Usual usage pattern is to create a single "master" instance, and either
32  * use that instance in sequential fashion, or to create derived "child"
33  * instances, which after use, are asked to return possible symbol additions
34  * to master instance. In either case benefit is that symbol table gets
35  * initialized so that further uses are more efficient, as eventually all
36  * symbols needed will already be in symbol table. At that point no more
37  * Symbol String allocations are needed, nor changes to symbol table itself.
38  *<p>
39  * Note that while individual SymbolTable instances are NOT thread-safe
40  * (much like generic collection classes), concurrently used "child"
41  * instances can be freely used without synchronization. However, using
42  * master table concurrently with child instances can only be done if
43  * access to master instance is read-only (i.e. no modifications done).
44  */
45 public final class CharsToNameCanonicalizer
46 {
47     /* If we use "multiply-add" based hash algorithm, this is the multiplier
48      * we use.
49      *<p>
50      * Note that JDK uses 31; but it seems that 33 produces fewer collisions,
51      * at least with tests we have.
52      */
53     public final static int HASH_MULT = 33;
54 
55     /**
56      * Default initial table size. Shouldn't be miniscule (as there's
57      * cost to both array realloc and rehashing), but let's keep
58      * it reasonably small. For systems that properly
59      * reuse factories it doesn't matter either way; but when
60      * recreating factories often, initial overhead may dominate.
61      */
62     private static final int DEFAULT_T_SIZE = 64;
63 
64     /**
65      * Let's not expand symbol tables past some maximum size;
66      * this should protected against OOMEs caused by large documents
67      * with unique (~= random) names.
68      */
69     private static final int MAX_T_SIZE = 0x10000; // 64k entries == 256k mem
70 
71     /**
72      * Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k;
73      * this corresponds to 64k main hash index. This should allow for enough distinct
74      * names for almost any case.
75      */
76     static final int MAX_ENTRIES_FOR_REUSE = 12000;
77 
78     /**
79      * Also: to thwart attacks based on hash collisions (which may or may not
80      * be cheap to calculate), we will need to detect "too long"
81      * collision chains. Let's start with static value of 100 entries
82      * for the longest legal chain.
83      *<p>
84      * Note: longest chain we have been able to produce without malicious
85      * intent has been 38 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols");
86      * our setting should be reasonable here.
87      *
88      * @since 2.1
89      */
90     static final int MAX_COLL_CHAIN_LENGTH = 100;
91 
92     /*
93     /**********************************************************
94     /* Configuration
95     /**********************************************************
96      */
97 
98     /**
99      * Sharing of learnt symbols is done by optional linking of symbol
100      * table instances with their parents. When parent linkage is
101      * defined, and child instance is released (call to <code>release</code>),
102      * parent's shared tables may be updated from the child instance.
103      */
104     final private CharsToNameCanonicalizer _parent;
105 
106     /**
107      * Member that is only used by the root table instance: root
108      * passes immutable state info child instances, and children
109      * may return new state if they add entries to the table.
110      * Child tables do NOT use the reference.
111      */
112     final private AtomicReference<TableInfo> _tableInfo;
113 
114     /**
115      * Seed value we use as the base to make hash codes non-static between
116      * different runs, but still stable for lifetime of a single symbol table
117      * instance.
118      * This is done for security reasons, to avoid potential DoS attack via
119      * hash collisions.
120      *
121      * @since 2.1
122      */
123     final private int _seed;
124 
125     final private int _flags;
126 
127     /**
128      * Whether any canonicalization should be attempted (whether using
129      * intern or not.
130      *<p>
131      * NOTE: non-final since we may need to disable this with overflow.
132      */
133     private boolean _canonicalize;
134 
135     /*
136     /**********************************************************
137     /* Actual symbol table data
138     /**********************************************************
139      */
140 
141     /**
142      * Primary matching symbols; it's expected most match occur from
143      * here.
144      */
145     private String[] _symbols;
146 
147     /**
148      * Overflow buckets; if primary doesn't match, lookup is done
149      * from here.
150      *<p>
151      * Note: Number of buckets is half of number of symbol entries, on
152      * assumption there's less need for buckets.
153      */
154     private Bucket[] _buckets;
155 
156     /**
157      * Current size (number of entries); needed to know if and when
158      * rehash.
159      */
160     private int _size;
161 
162     /**
163      * Limit that indicates maximum size this instance can hold before
164      * it needs to be expanded and rehashed. Calculated using fill
165      * factor passed in to constructor.
166      */
167     private int _sizeThreshold;
168 
169     /**
170      * Mask used to get index from hash values; equal to
171      * <code>_buckets.length - 1</code>, when _buckets.length is
172      * a power of two.
173      */
174     private int _indexMask;
175 
176     /**
177      * We need to keep track of the longest collision list; this is needed
178      * both to indicate problems with attacks and to allow flushing for
179      * other cases.
180      *
181      * @since 2.1
182      */
183     private int _longestCollisionList;
184 
185     /*
186     /**********************************************************
187     /* State regarding shared arrays
188     /**********************************************************
189      */
190 
191     /**
192      * Flag that indicates whether underlying data structures for
193      * the main hash area are shared or not. If they are, then they
194      * need to be handled in copy-on-write way, i.e. if they need
195      * to be modified, a copy needs to be made first; at this point
196      * it will not be shared any more, and can be modified.
197      *<p>
198      * This flag needs to be checked both when adding new main entries,
199      * and when adding new collision list queues (i.e. creating a new
200      * collision list head entry)
201      */
202     private boolean _hashShared;
203 
204     /*
205     /**********************************************************
206     /* Bit of DoS detection goodness
207     /**********************************************************
208      */
209 
210     /**
211      * Lazily constructed structure that is used to keep track of
212      * collision buckets that have overflowed once: this is used
213      * to detect likely attempts at denial-of-service attacks that
214      * uses hash collisions.
215      *
216      * @since 2.4
217      */
218     private BitSet _overflows;
219 
220     /*
221     /**********************************************************
222     /* Life-cycle: constructors
223     /**********************************************************
224      */
225 
226     /**
227      * Main method for constructing a root symbol table instance.
228      */
CharsToNameCanonicalizer(int seed)229     private CharsToNameCanonicalizer(int seed)
230     {
231         _parent = null;
232         _seed = seed;
233 
234         // these settings don't really matter for the bootstrap instance
235         _canonicalize = true;
236         _flags = -1;
237         // And we'll also set flags so no copying of buckets is needed:
238         _hashShared = false; // doesn't really matter for root instance
239         _longestCollisionList = 0;
240 
241         _tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(DEFAULT_T_SIZE));
242         // and actually do NOT assign buffers so we'll find if anyone tried to
243         // use root instance
244     }
245 
246     /**
247      * Internal constructor used when creating child instances.
248      */
CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed, TableInfo parentState)249     private CharsToNameCanonicalizer(CharsToNameCanonicalizer parent, int flags, int seed,
250             TableInfo parentState)
251     {
252         _parent = parent;
253         _seed = seed;
254         _tableInfo = null; // not used by child tables
255         _flags = flags;
256         _canonicalize = JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(flags);
257 
258         // Then copy shared state
259         _symbols = parentState.symbols;
260         _buckets = parentState.buckets;
261 
262         _size = parentState.size;
263         _longestCollisionList = parentState.longestCollisionList;
264 
265         // Hard-coded fill factor, 75%
266         int arrayLen = (_symbols.length);
267         _sizeThreshold = _thresholdSize(arrayLen);
268         _indexMask =  (arrayLen - 1);
269 
270         // Need to make copies of arrays, if/when adding new entries
271         _hashShared = true;
272     }
273 
_thresholdSize(int hashAreaSize)274     private static int _thresholdSize(int hashAreaSize) { return hashAreaSize - (hashAreaSize >> 2); }
275 
276     /*
277     /**********************************************************
278     /* Life-cycle: factory methods, merging
279     /**********************************************************
280      */
281 
282     /**
283      * Method called to create root canonicalizer for a {@link com.fasterxml.jackson.core.JsonFactory}
284      * instance. Root instance is never used directly; its main use is for
285      * storing and sharing underlying symbol arrays as needed.
286      */
createRoot()287     public static CharsToNameCanonicalizer createRoot() {
288         // Need to use a variable seed, to thwart hash-collision based attacks.
289         // 14-Feb-2017, tatu: not sure it actually helps, at all, since it won't
290         //   change mixing or any of the steps. Should likely just remove in future.
291         long now = System.currentTimeMillis();
292         // ensure it's not 0; and might as well require to be odd so:
293         int seed = (((int) now) + ((int) (now >>> 32))) | 1;
294         return createRoot(seed);
295     }
296 
createRoot(int seed)297     protected static CharsToNameCanonicalizer createRoot(int seed) {
298         return new CharsToNameCanonicalizer(seed);
299     }
300     /**
301      * "Factory" method; will create a new child instance of this symbol
302      * table. It will be a copy-on-write instance, ie. it will only use
303      * read-only copy of parent's data, but when changes are needed, a
304      * copy will be created.
305      *<p>
306      * Note: while this method is synchronized, it is generally not
307      * safe to both use makeChild/mergeChild, AND to use instance
308      * actively. Instead, a separate 'root' instance should be used
309      * on which only makeChild/mergeChild are called, but instance itself
310      * is not used as a symbol table.
311      */
makeChild(int flags)312     public CharsToNameCanonicalizer makeChild(int flags) {
313         return new CharsToNameCanonicalizer(this, flags, _seed, _tableInfo.get());
314     }
315 
316     /**
317      * Method called by the using code to indicate it is done with this instance.
318      * This lets instance merge accumulated changes into parent (if need be),
319      * safely and efficiently, and without calling code having to know about parent
320      * information.
321      */
release()322     public void release() {
323         // If nothing has been added, nothing to do
324         if (!maybeDirty()) { return; }
325 
326         // we will try to merge if child table has new entries
327         if (_parent != null && _canonicalize) { // canonicalize set to false if max size was reached
328             _parent.mergeChild(new TableInfo(this));
329             // Let's also mark this instance as dirty, so that just in
330             // case release was too early, there's no corruption of possibly shared data.
331             _hashShared = true;
332         }
333     }
334 
335     /**
336      * Method that allows contents of child table to potentially be
337      * "merged in" with contents of this symbol table.
338      *<p>
339      * Note that caller has to make sure symbol table passed in is
340      * really a child or sibling of this symbol table.
341      */
mergeChild(TableInfo childState)342     private void mergeChild(TableInfo childState)
343     {
344         final int childCount = childState.size;
345         TableInfo currState = _tableInfo.get();
346 
347         // Should usually grow; but occasionally could also shrink if (but only if)
348         // collision list overflow ends up clearing some collision lists.
349         if (childCount == currState.size) {
350             return;
351         }
352         // One caveat: let's try to avoid problems with  degenerate cases of documents with
353         // generated "random" names: for these, symbol tables would bloat indefinitely.
354         // One way to do this is to just purge tables if they grow
355         // too large, and that's what we'll do here.
356         if (childCount > MAX_ENTRIES_FOR_REUSE) {
357             // At any rate, need to clean up the tables
358             childState = TableInfo.createInitial(DEFAULT_T_SIZE);
359         }
360         _tableInfo.compareAndSet(currState, childState);
361     }
362 
363     /*
364     /**********************************************************
365     /* Public API, generic accessors:
366     /**********************************************************
367      */
368 
size()369     public int size() {
370         if (_tableInfo != null) { // root table
371             return _tableInfo.get().size;
372         }
373         // nope, child table
374         return _size;
375     }
376 
377     /**
378      * Method for checking number of primary hash buckets this symbol
379      * table uses.
380      *
381      * @since 2.1
382      */
bucketCount()383     public int bucketCount() {  return _symbols.length; }
maybeDirty()384     public boolean maybeDirty() { return !_hashShared; }
hashSeed()385     public int hashSeed() { return _seed; }
386 
387     /**
388      * Method mostly needed by unit tests; calculates number of
389      * entries that are in collision list. Value can be at most
390      * ({@link #size} - 1), but should usually be much lower, ideally 0.
391      *
392      * @since 2.1
393      */
collisionCount()394     public int collisionCount() {
395         int count = 0;
396 
397         for (Bucket bucket : _buckets) {
398             if (bucket != null) {
399                 count += bucket.length;
400             }
401         }
402         return count;
403     }
404 
405     /**
406      * Method mostly needed by unit tests; calculates length of the
407      * longest collision chain. This should typically be a low number,
408      * but may be up to {@link #size} - 1 in the pathological case
409      *
410      * @since 2.1
411      */
maxCollisionLength()412     public int maxCollisionLength() { return _longestCollisionList; }
413 
414     /*
415     /**********************************************************
416     /* Public API, accessing symbols:
417     /**********************************************************
418      */
419 
findSymbol(char[] buffer, int start, int len, int h)420     public String findSymbol(char[] buffer, int start, int len, int h)
421     {
422         if (len < 1) { // empty Strings are simplest to handle up front
423             return "";
424         }
425         if (!_canonicalize) { // [JACKSON-259]
426             return new String(buffer, start, len);
427         }
428 
429         /* Related to problems with sub-standard hashing (somewhat
430          * relevant for collision attacks too), let's try little
431          * bit of shuffling to improve hash codes.
432          * (note, however, that this can't help with full collisions)
433          */
434         int index = _hashToIndex(h);
435         String sym = _symbols[index];
436 
437         // Optimal case; checking existing primary symbol for hash index:
438         if (sym != null) {
439             // Let's inline primary String equality checking:
440             if (sym.length() == len) {
441                 int i = 0;
442                 while (sym.charAt(i) == buffer[start+i]) {
443                     // Optimal case; primary match found
444                     if (++i == len) {
445                         return sym;
446                     }
447                 }
448             }
449             Bucket b = _buckets[index>>1];
450             if (b != null) {
451                 sym = b.has(buffer, start, len);
452                 if (sym != null) {
453                     return sym;
454                 }
455                 sym = _findSymbol2(buffer, start, len, b.next);
456                 if (sym != null) {
457                     return sym;
458                 }
459             }
460         }
461         return _addSymbol(buffer, start, len, h, index);
462     }
463 
_findSymbol2(char[] buffer, int start, int len, Bucket b)464     private String _findSymbol2(char[] buffer, int start, int len, Bucket b) {
465         while (b != null) {
466             String sym = b.has(buffer, start, len);
467             if (sym != null) {
468                 return sym;
469             }
470             b = b.next;
471         }
472         return null;
473     }
474 
_addSymbol(char[] buffer, int start, int len, int h, int index)475     private String _addSymbol(char[] buffer, int start, int len, int h, int index)
476     {
477         if (_hashShared) { //need to do copy-on-write?
478             copyArrays();
479             _hashShared = false;
480         } else if (_size >= _sizeThreshold) { // Need to expand?
481             rehash();
482             // Need to recalc hash; rare occurrence (index mask has been
483              // recalculated as part of rehash)
484             index = _hashToIndex(calcHash(buffer, start, len));
485         }
486 
487         String newSymbol = new String(buffer, start, len);
488         if (JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(_flags)) {
489             newSymbol = InternCache.instance.intern(newSymbol);
490         }
491         ++_size;
492         // Ok; do we need to add primary entry, or a bucket?
493         if (_symbols[index] == null) {
494             _symbols[index] = newSymbol;
495         } else {
496             final int bix = (index >> 1);
497             Bucket newB = new Bucket(newSymbol, _buckets[bix]);
498             int collLen = newB.length;
499             if (collLen > MAX_COLL_CHAIN_LENGTH) {
500                 // 23-May-2014, tatu: Instead of throwing an exception right away,
501                 //    let's handle in bit smarter way.
502                 _handleSpillOverflow(bix, newB, index);
503             } else {
504                 _buckets[bix] = newB;
505                 _longestCollisionList = Math.max(collLen, _longestCollisionList);
506             }
507         }
508         return newSymbol;
509     }
510 
511     /**
512      * Method called when an overflow bucket has hit the maximum expected length:
513      * this may be a case of DoS attack. Deal with it based on settings by either
514      * clearing up bucket (to avoid indefinite expansion) or throwing exception.
515      * Currently the first overflow for any single bucket DOES NOT throw an exception,
516      * only second time (per symbol table instance)
517      */
_handleSpillOverflow(int bucketIndex, Bucket newBucket, int mainIndex)518     private void _handleSpillOverflow(int bucketIndex, Bucket newBucket, int mainIndex)
519     {
520         if (_overflows == null) {
521             _overflows = new BitSet();
522             _overflows.set(bucketIndex);
523         } else {
524             if (_overflows.get(bucketIndex)) {
525                 // Has happened once already for this bucket index, so probably not coincidental...
526                 if (JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(_flags)) {
527                     reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH);
528                 }
529                 // but even if we don't fail, we will stop canonicalizing as safety measure
530                 // (so as not to cause problems with PermGen)
531                 _canonicalize = false;
532             } else {
533                 _overflows.set(bucketIndex);
534             }
535         }
536 
537         // regardless, if we get this far, clear up the bucket, adjust size appropriately.
538         _symbols[mainIndex] = newBucket.symbol;
539         _buckets[bucketIndex] = null;
540         // newBucket contains new symbol; but we will
541         _size -= (newBucket.length);
542         // we could calculate longest; but for now just mark as invalid
543         _longestCollisionList = -1;
544     }
545 
546     /**
547      * Helper method that takes in a "raw" hash value, shuffles it as necessary,
548      * and truncates to be used as the index.
549      */
_hashToIndex(int rawHash)550     public int _hashToIndex(int rawHash) {
551         // doing these seems to help a bit
552         rawHash += (rawHash >>> 15);
553         rawHash ^= (rawHash << 7);
554         rawHash += (rawHash >>> 3);
555         return (rawHash & _indexMask);
556     }
557 
558     /**
559      * Implementation of a hashing method for variable length
560      * Strings. Most of the time intention is that this calculation
561      * is done by caller during parsing, not here; however, sometimes
562      * it needs to be done for parsed "String" too.
563      *
564      * @param len Length of String; has to be at least 1 (caller guarantees
565      *   this pre-condition)
566      */
calcHash(char[] buffer, int start, int len)567     public int calcHash(char[] buffer, int start, int len) {
568         int hash = _seed;
569         for (int i = start, end = start+len; i < end; ++i) {
570             hash = (hash * HASH_MULT) + (int) buffer[i];
571         }
572         // NOTE: shuffling, if any, is done in 'findSymbol()', not here:
573         return (hash == 0) ? 1 : hash;
574     }
575 
calcHash(String key)576     public int calcHash(String key)
577     {
578         final int len = key.length();
579 
580         int hash = _seed;
581         for (int i = 0; i < len; ++i) {
582             hash = (hash * HASH_MULT) + (int) key.charAt(i);
583         }
584         // NOTE: shuffling, if any, is done in 'findSymbol()', not here:
585         return (hash == 0) ? 1 : hash;
586     }
587 
588     /*
589     /**********************************************************
590     /* Internal methods
591     /**********************************************************
592      */
593 
594     /**
595      * Method called when copy-on-write is needed; generally when first
596      * change is made to a derived symbol table.
597      */
copyArrays()598     private void copyArrays() {
599         final String[] oldSyms = _symbols;
600         _symbols = Arrays.copyOf(oldSyms, oldSyms.length);
601         final Bucket[] oldBuckets = _buckets;
602         _buckets = Arrays.copyOf(oldBuckets, oldBuckets.length);
603     }
604 
605     /**
606      * Method called when size (number of entries) of symbol table grows
607      * so big that load factor is exceeded. Since size has to remain
608      * power of two, arrays will then always be doubled. Main work
609      * is really redistributing old entries into new String/Bucket
610      * entries.
611      */
rehash()612     private void rehash() {
613         final int size = _symbols.length;
614         int newSize = size + size;
615 
616         /* 12-Mar-2010, tatu: Let's actually limit maximum size we are
617          *    prepared to use, to guard against OOME in case of unbounded
618          *    name sets (unique [non-repeating] names)
619          */
620         if (newSize > MAX_T_SIZE) {
621             // If this happens, there's no point in either growing or shrinking hash areas.
622             // Rather, let's just cut our losses and stop canonicalizing.
623             _size = 0;
624             _canonicalize = false;
625             // in theory, could just leave these as null, but...
626             _symbols = new String[DEFAULT_T_SIZE];
627             _buckets = new Bucket[DEFAULT_T_SIZE>>1];
628             _indexMask = DEFAULT_T_SIZE-1;
629             _hashShared = false;
630             return;
631         }
632 
633         final String[] oldSyms = _symbols;
634         final Bucket[] oldBuckets = _buckets;
635         _symbols = new String[newSize];
636         _buckets = new Bucket[newSize >> 1];
637         // Let's update index mask, threshold, now (needed for rehashing)
638         _indexMask = newSize - 1;
639         _sizeThreshold = _thresholdSize(newSize);
640 
641         int count = 0; // let's do sanity check
642 
643         // Need to do two loops, unfortunately, since spill-over area is
644         // only half the size:
645         int maxColl = 0;
646         for (int i = 0; i < size; ++i) {
647             String symbol = oldSyms[i];
648             if (symbol != null) {
649                 ++count;
650                 int index = _hashToIndex(calcHash(symbol));
651                 if (_symbols[index] == null) {
652                     _symbols[index] = symbol;
653                 } else {
654                     int bix = (index >> 1);
655                     Bucket newB = new Bucket(symbol, _buckets[bix]);
656                     _buckets[bix] = newB;
657                     maxColl = Math.max(maxColl, newB.length);
658                 }
659             }
660         }
661 
662         final int bucketSize = (size >> 1);
663         for (int i = 0; i < bucketSize; ++i) {
664             Bucket b = oldBuckets[i];
665             while (b != null) {
666                 ++count;
667                 String symbol = b.symbol;
668                 int index = _hashToIndex(calcHash(symbol));
669                 if (_symbols[index] == null) {
670                     _symbols[index] = symbol;
671                 } else {
672                     int bix = (index >> 1);
673                     Bucket newB = new Bucket(symbol, _buckets[bix]);
674                     _buckets[bix] = newB;
675                     maxColl = Math.max(maxColl, newB.length);
676                 }
677                 b = b.next;
678             }
679         }
680         _longestCollisionList = maxColl;
681         _overflows = null;
682 
683         if (count != _size) {
684             throw new IllegalStateException(String.format(
685                     "Internal error on SymbolTable.rehash(): had %d entries; now have %d",
686                     _size, count));
687         }
688     }
689 
690     /**
691      * @since 2.1
692      */
reportTooManyCollisions(int maxLen)693     protected void reportTooManyCollisions(int maxLen) {
694         throw new IllegalStateException("Longest collision chain in symbol table (of size "+_size
695                 +") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions");
696     }
697 
698     // since 2.10, for tests only
699     /**
700      * Diagnostics method that will verify that internal data structures are consistent;
701      * not meant as user-facing method but only for test suites and possible troubleshooting.
702      *
703      * @since 2.10
704      */
verifyInternalConsistency()705     protected void verifyInternalConsistency() {
706         int count = 0;
707         final int size = _symbols.length;
708 
709         for (int i = 0; i < size; ++i) {
710             String symbol = _symbols[i];
711             if (symbol != null) {
712                 ++count;
713             }
714         }
715 
716         final int bucketSize = (size >> 1);
717         for (int i = 0; i < bucketSize; ++i) {
718             for (Bucket b = _buckets[i]; b != null; b = b.next) {
719                 ++count;
720             }
721         }
722         if (count != _size) {
723             throw new IllegalStateException(String.format("Internal error: expected internal size %d vs calculated count %d",
724                     _size, count));
725         }
726     }
727 
728     // For debugging, comment out
729     /*
730     @Override
731     public String toString()
732     {
733         StringBuilder sb = new StringBuilder();
734         int primaryCount = 0;
735         for (String s : _symbols) {
736             if (s != null) ++primaryCount;
737         }
738 
739         sb.append("[BytesToNameCanonicalizer, size: ");
740         sb.append(_size);
741         sb.append('/');
742         sb.append(_symbols.length);
743         sb.append(", ");
744         sb.append(primaryCount);
745         sb.append('/');
746         sb.append(_size - primaryCount);
747         sb.append(" coll; avg length: ");
748 
749         // Average length: minimum of 1 for all (1 == primary hit);
750         // and then 1 per each traversal for collisions/buckets
751         //int maxDist = 1;
752         int pathCount = _size;
753         for (Bucket b : _buckets) {
754             if (b != null) {
755                 int spillLen = b.length;
756                 for (int j = 1; j <= spillLen; ++j) {
757                     pathCount += j;
758                 }
759             }
760         }
761         double avgLength;
762 
763         if (_size == 0) {
764             avgLength = 0.0;
765         } else {
766             avgLength = (double) pathCount / (double) _size;
767         }
768         // let's round up a bit (two 2 decimal places)
769         //avgLength -= (avgLength % 0.01);
770 
771         sb.append(avgLength);
772         sb.append(']');
773         return sb.toString();
774     }
775 */
776 
777     /*
778     /**********************************************************
779     /* Helper classes
780     /**********************************************************
781      */
782 
783     /**
784      * This class is a symbol table entry. Each entry acts as a node
785      * in a linked list.
786      */
787     static final class Bucket
788     {
789         public final String symbol;
790         public final Bucket next;
791         public final int length;
792 
Bucket(String s, Bucket n)793         public Bucket(String s, Bucket n) {
794             symbol = s;
795             next = n;
796             length = (n == null) ? 1 : n.length+1;
797         }
798 
has(char[] buf, int start, int len)799         public String has(char[] buf, int start, int len) {
800             if (symbol.length() != len) {
801                 return null;
802             }
803             int i = 0;
804             do {
805                 if (symbol.charAt(i) != buf[start+i]) {
806                     return null;
807                 }
808             } while (++i < len);
809             return symbol;
810         }
811     }
812 
813     /**
814      * Immutable value class used for sharing information as efficiently
815      * as possible, by only require synchronization of reference manipulation
816      * but not access to contents.
817      *
818      * @since 2.8.7
819      */
820     private final static class TableInfo
821     {
822         final int size;
823         final int longestCollisionList;
824         final String[] symbols;
825         final Bucket[] buckets;
826 
TableInfo(int size, int longestCollisionList, String[] symbols, Bucket[] buckets)827         public TableInfo(int size, int longestCollisionList,
828                 String[] symbols, Bucket[] buckets)
829         {
830             this.size = size;
831             this.longestCollisionList = longestCollisionList;
832             this.symbols = symbols;
833             this.buckets = buckets;
834         }
835 
TableInfo(CharsToNameCanonicalizer src)836         public TableInfo(CharsToNameCanonicalizer src)
837         {
838             this.size = src._size;
839             this.longestCollisionList = src._longestCollisionList;
840             this.symbols = src._symbols;
841             this.buckets = src._buckets;
842         }
843 
createInitial(int sz)844         public static TableInfo createInitial(int sz) {
845             return new TableInfo(0,
846                     0, // longestCollisionList
847                     new String[sz], new Bucket[sz >> 1]);
848         }
849     }
850 }
851