• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2018 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 
5 // created: 2018may04 Markus W. Scherer
6 
7 package ohos.global.icu.util;
8 
9 import java.util.Arrays;
10 
11 /**
12  * Mutable Unicode code point trie.
13  * Fast map from Unicode code points (U+0000..U+10FFFF) to 32-bit integer values.
14  * For details see http://site.icu-project.org/design/struct/utrie
15  *
16  * <p>Setting values (especially ranges) and lookup is fast.
17  * The mutable trie is only somewhat space-efficient.
18  * It builds a compacted, immutable {@link CodePointTrie}.
19  *
20  * <p>This trie can be modified while iterating over its contents.
21  * For example, it is possible to merge its values with those from another
22  * set of ranges (e.g., another @{link CodePointMap}):
23  * Iterate over those source ranges; for each of them iterate over this trie;
24  * add the source value into the value of each trie range.
25  *
26  * @hide exposed on OHOS
27  */
28 public final class MutableCodePointTrie extends CodePointMap implements Cloneable {
29     /**
30      * Constructs a mutable trie that initially maps each Unicode code point to the same value.
31      * It uses 32-bit data values until
32      * {@link #buildImmutable(ohos.global.icu.util.CodePointTrie.Type, ohos.global.icu.util.CodePointTrie.ValueWidth)}
33      * is called.
34      * buildImmutable() takes a valueWidth parameter which
35      * determines the number of bits in the data value in the resulting {@link CodePointTrie}.
36      *
37      * @param initialValue the initial value that is set for all code points
38      * @param errorValue the value for out-of-range code points and ill-formed UTF-8/16
39      */
MutableCodePointTrie(int initialValue, int errorValue)40     public MutableCodePointTrie(int initialValue, int errorValue) {
41         index = new int[BMP_I_LIMIT];
42         index3NullOffset = -1;
43         data = new int[INITIAL_DATA_LENGTH];
44         dataNullOffset = -1;
45         origInitialValue = initialValue;
46         this.initialValue = initialValue;
47         this.errorValue = errorValue;
48         highValue = initialValue;
49     }
50 
51     /**
52      * Clones this mutable trie.
53      *
54      * @return the clone
55      */
56     @Override
clone()57     public MutableCodePointTrie clone() {
58         try {
59             MutableCodePointTrie builder = (MutableCodePointTrie) super.clone();
60             int iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
61             builder.index = new int[iCapacity];
62             builder.flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3];
63             for (int i = 0, iLimit = highStart >> CodePointTrie.SHIFT_3; i < iLimit; ++i) {
64                 builder.index[i] = index[i];
65                 builder.flags[i] = flags[i];
66             }
67             builder.index3NullOffset = index3NullOffset;
68             builder.data = data.clone();
69             builder.dataLength = dataLength;
70             builder.dataNullOffset = dataNullOffset;
71             builder.origInitialValue = origInitialValue;
72             builder.initialValue = initialValue;
73             builder.errorValue = errorValue;
74             builder.highStart = highStart;
75             builder.highValue = highValue;
76             assert index16 == null;
77             return builder;
78         } catch (CloneNotSupportedException ignored) {
79             // Unreachable: Cloning *is* supported.
80             return null;
81         }
82     }
83 
84     /**
85      * Creates a mutable trie with the same contents as the {@link CodePointMap}.
86      *
87      * @param map the source map or trie
88      * @return the mutable trie
89      */
fromCodePointMap(CodePointMap map)90     public static MutableCodePointTrie fromCodePointMap(CodePointMap map) {
91         // TODO: Consider special code branch for map instanceof CodePointTrie?
92         // Use the highValue as the initialValue to reduce the highStart.
93         int errorValue = map.get(-1);
94         int initialValue = map.get(MAX_UNICODE);
95         MutableCodePointTrie mutableTrie = new MutableCodePointTrie(initialValue, errorValue);
96         CodePointMap.Range range = new CodePointMap.Range();
97         int start = 0;
98         while (map.getRange(start, null, range)) {
99             int end = range.getEnd();
100             int value = range.getValue();
101             if (value != initialValue) {
102                 if (start == end) {
103                     mutableTrie.set(start, value);
104                 } else {
105                     mutableTrie.setRange(start, end, value);
106                 }
107             }
108             start = end + 1;
109         }
110         return mutableTrie;
111     }
112 
clear()113     private void clear() {
114         index3NullOffset = dataNullOffset = -1;
115         dataLength = 0;
116         highValue = initialValue = origInitialValue;
117         highStart = 0;
118         index16 = null;
119     }
120 
121     /**
122      * {@inheritDoc}
123      */
124     @Override
get(int c)125     public int get(int c) {
126         if (c < 0 || MAX_UNICODE < c) {
127             return errorValue;
128         }
129         if (c >= highStart) {
130             return highValue;
131         }
132         int i = c >> CodePointTrie.SHIFT_3;
133         if (flags[i] == ALL_SAME) {
134             return index[i];
135         } else {
136             return data[index[i] + (c & CodePointTrie.SMALL_DATA_MASK)];
137         }
138     }
139 
maybeFilterValue(int value, int initialValue, int nullValue, ValueFilter filter)140     private static final int maybeFilterValue(int value, int initialValue, int nullValue,
141             ValueFilter filter) {
142         if (value == initialValue) {
143             value = nullValue;
144         } else if (filter != null) {
145             value = filter.apply(value);
146         }
147         return value;
148     }
149 
150     /**
151      * {@inheritDoc}
152      *
153      * <p>The trie can be modified between calls to this function.
154      */
155     @Override
getRange(int start, CodePointTrie.ValueFilter filter, CodePointTrie.Range range)156     public boolean getRange(int start, CodePointTrie.ValueFilter filter,
157             CodePointTrie.Range range) {
158         if (start < 0 || MAX_UNICODE < start) {
159             return false;
160         }
161         if (start >= highStart) {
162             int value = highValue;
163             if (filter != null) { value = filter.apply(value); }
164             range.set(start, MAX_UNICODE, value);
165             return true;
166         }
167         int nullValue = initialValue;
168         if (filter != null) { nullValue = filter.apply(nullValue); }
169         int c = start;
170         // Initialize to make compiler happy. Real value when haveValue is true.
171         int trieValue = 0, value = 0;
172         boolean haveValue = false;
173         int i = c >> CodePointTrie.SHIFT_3;
174         do {
175             if (flags[i] == ALL_SAME) {
176                 int trieValue2 = index[i];
177                 if (haveValue) {
178                     if (trieValue2 != trieValue) {
179                         if (filter == null ||
180                                 maybeFilterValue(trieValue2, initialValue, nullValue,
181                                         filter) != value) {
182                             range.set(start, c - 1, value);
183                             return true;
184                         }
185                         trieValue = trieValue2;  // may or may not help
186                     }
187                 } else {
188                     trieValue = trieValue2;
189                     value = maybeFilterValue(trieValue2, initialValue, nullValue, filter);
190                     haveValue = true;
191                 }
192                 c = (c + CodePointTrie.SMALL_DATA_BLOCK_LENGTH) & ~CodePointTrie.SMALL_DATA_MASK;
193             } else /* MIXED */ {
194                 int di = index[i] + (c & CodePointTrie.SMALL_DATA_MASK);
195                 int trieValue2 = data[di];
196                 if (haveValue) {
197                     if (trieValue2 != trieValue) {
198                         if (filter == null ||
199                                 maybeFilterValue(trieValue2, initialValue, nullValue,
200                                         filter) != value) {
201                             range.set(start, c - 1, value);
202                             return true;
203                         }
204                         trieValue = trieValue2;  // may or may not help
205                     }
206                 } else {
207                     trieValue = trieValue2;
208                     value = maybeFilterValue(trieValue2, initialValue, nullValue, filter);
209                     haveValue = true;
210                 }
211                 while ((++c & CodePointTrie.SMALL_DATA_MASK) != 0) {
212                     trieValue2 = data[++di];
213                     if (trieValue2 != trieValue) {
214                         if (filter == null ||
215                                 maybeFilterValue(trieValue2, initialValue, nullValue,
216                                         filter) != value) {
217                             range.set(start, c - 1, value);
218                             return true;
219                         }
220                         trieValue = trieValue2;  // may or may not help
221                     }
222                 }
223             }
224             ++i;
225         } while (c < highStart);
226         assert(haveValue);
227         if (maybeFilterValue(highValue, initialValue, nullValue, filter) != value) {
228             range.set(start, c - 1, value);
229         } else {
230             range.set(start, MAX_UNICODE, value);
231         }
232         return true;
233     }
234 
writeBlock(int block, int value)235     private void writeBlock(int block, int value) {
236         int limit = block + CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
237         Arrays.fill(data, block, limit, value);
238     }
239 
240     /**
241      * Sets a value for a code point.
242      *
243      * @param c the code point
244      * @param value the value
245      */
set(int c, int value)246     public void set(int c, int value) {
247         if (c < 0 || MAX_UNICODE < c) {
248             throw new IllegalArgumentException("invalid code point");
249         }
250 
251         ensureHighStart(c);
252         int block = getDataBlock(c >> CodePointTrie.SHIFT_3);
253         data[block + (c & CodePointTrie.SMALL_DATA_MASK)] = value;
254     }
255 
fillBlock(int block, int start, int limit, int value)256     private void fillBlock(int block, int start, int limit, int value) {
257         Arrays.fill(data, block + start, block + limit, value);
258     }
259 
260     /**
261      * Sets a value for each code point [start..end].
262      * Faster and more space-efficient than setting the value for each code point separately.
263      *
264      * @param start the first code point to get the value
265      * @param end the last code point to get the value (inclusive)
266      * @param value the value
267      */
setRange(int start, int end, int value)268     public void setRange(int start, int end, int value) {
269         if (start < 0 || MAX_UNICODE < start || end < 0 || MAX_UNICODE < end || start > end) {
270             throw new IllegalArgumentException("invalid code point range");
271         }
272         ensureHighStart(end);
273 
274         int limit = end + 1;
275         if ((start & CodePointTrie.SMALL_DATA_MASK) != 0) {
276             // Set partial block at [start..following block boundary[.
277             int block = getDataBlock(start >> CodePointTrie.SHIFT_3);
278             int nextStart = (start + CodePointTrie.SMALL_DATA_MASK) & ~CodePointTrie.SMALL_DATA_MASK;
279             if (nextStart <= limit) {
280                 fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK,
281                           CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value);
282                 start = nextStart;
283             } else {
284                 fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK,
285                           limit & CodePointTrie.SMALL_DATA_MASK, value);
286                 return;
287             }
288         }
289 
290         // Number of positions in the last, partial block.
291         int rest = limit & CodePointTrie.SMALL_DATA_MASK;
292 
293         // Round down limit to a block boundary.
294         limit &= ~CodePointTrie.SMALL_DATA_MASK;
295 
296         // Iterate over all-value blocks.
297         while (start < limit) {
298             int i = start >> CodePointTrie.SHIFT_3;
299             if (flags[i] == ALL_SAME) {
300                 index[i] = value;
301             } else /* MIXED */ {
302                 fillBlock(index[i], 0, CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value);
303             }
304             start += CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
305         }
306 
307         if (rest > 0) {
308             // Set partial block at [last block boundary..limit[.
309             int block = getDataBlock(start >> CodePointTrie.SHIFT_3);
310             fillBlock(block, 0, rest, value);
311         }
312     }
313 
314     /**
315      * Compacts the data and builds an immutable {@link CodePointTrie} according to the parameters.
316      * After this, the mutable trie will be empty.
317      *
318      * <p>The mutable trie stores 32-bit values until buildImmutable() is called.
319      * If values shorter than 32 bits are to be stored in the immutable trie,
320      * then the upper bits are discarded.
321      * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581,
322      * and the value width is 8 bits, then each of these is stored as 0x81
323      * and the immutable trie will return that as an unsigned value.
324      * (Some implementations may want to make productive temporary use of the upper bits
325      * until buildImmutable() discards them.)
326      *
327      * <p>Not every possible set of mappings can be built into a CodePointTrie,
328      * because of limitations resulting from speed and space optimizations.
329      * Every Unicode assigned character can be mapped to a unique value.
330      * Typical data yields data structures far smaller than the limitations.
331      *
332      * <p>It is possible to construct extremely unusual mappings that exceed the
333      * data structure limits.
334      * In such a case this function will throw an exception.
335      *
336      * @param type selects the trie type
337      * @param valueWidth selects the number of bits in a trie data value; if smaller than 32 bits,
338      *                   then the values stored in the trie will be truncated first
339      *
340      * @see #fromCodePointMap(CodePointMap)
341      */
buildImmutable(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth)342     public CodePointTrie buildImmutable(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) {
343         if (type == null || valueWidth == null) {
344             throw new IllegalArgumentException("The type and valueWidth must be specified.");
345         }
346 
347         try {
348             return build(type, valueWidth);
349         } finally {
350             clear();
351         }
352     }
353 
354     private static final int MAX_UNICODE = 0x10ffff;
355 
356     private static final int UNICODE_LIMIT = 0x110000;
357     private static final int BMP_LIMIT = 0x10000;
358     private static final int ASCII_LIMIT = 0x80;
359 
360     private static final int I_LIMIT = UNICODE_LIMIT >> CodePointTrie.SHIFT_3;
361     private static final int BMP_I_LIMIT = BMP_LIMIT >> CodePointTrie.SHIFT_3;
362     private static final int ASCII_I_LIMIT = ASCII_LIMIT >> CodePointTrie.SHIFT_3;
363 
364     private static final int SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3));
365 
366     // Flag values for data blocks.
367     private static final byte ALL_SAME = 0;
368     private static final byte MIXED = 1;
369     private static final byte SAME_AS = 2;
370 
371     /** Start with allocation of 16k data entries. */
372     private static final int INITIAL_DATA_LENGTH = (1 << 14);
373 
374     /** Grow about 8x each time. */
375     private static final int MEDIUM_DATA_LENGTH = (1 << 17);
376 
377     /**
378      * Maximum length of the build-time data array.
379      * One entry per 0x110000 code points.
380      */
381     private static final int MAX_DATA_LENGTH = UNICODE_LIMIT;
382 
383     // Flag values for index-3 blocks while compacting/building.
384     private static final byte I3_NULL = 0;
385     private static final byte I3_BMP = 1;
386     private static final byte I3_16 = 2;
387     private static final byte I3_18 = 3;
388 
389     private static final int INDEX_3_18BIT_BLOCK_LENGTH = CodePointTrie.INDEX_3_BLOCK_LENGTH + CodePointTrie.INDEX_3_BLOCK_LENGTH / 8;
390 
391     private int[] index;
392     private int index3NullOffset;
393     private int[] data;
394     private int dataLength;
395     private int dataNullOffset;
396 
397     private int origInitialValue;
398     private int initialValue;
399     private int errorValue;
400     private int highStart;
401     private int highValue;
402 
403     /** Temporary array while building the final data. */
404     private char[] index16;
405     private byte[] flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3];
406 
ensureHighStart(int c)407     private void ensureHighStart(int c) {
408         if (c >= highStart) {
409             // Round up to a CodePointTrie.CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
410             c = (c + CodePointTrie.CP_PER_INDEX_2_ENTRY) & ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1);
411             int i = highStart >> CodePointTrie.SHIFT_3;
412             int iLimit = c >> CodePointTrie.SHIFT_3;
413             if (iLimit > index.length) {
414                 int[] newIndex = new int[I_LIMIT];
415                 for (int j = 0; j < i; ++j) { newIndex[j] = index[j]; }
416                 index = newIndex;
417             }
418             do {
419                 flags[i] = ALL_SAME;
420                 index[i] = initialValue;
421             } while(++i < iLimit);
422             highStart = c;
423         }
424     }
425 
allocDataBlock(int blockLength)426     private int allocDataBlock(int blockLength) {
427         int newBlock = dataLength;
428         int newTop = newBlock + blockLength;
429         if (newTop > data.length) {
430             int capacity;
431             if (data.length < MEDIUM_DATA_LENGTH) {
432                 capacity = MEDIUM_DATA_LENGTH;
433             } else if (data.length < MAX_DATA_LENGTH) {
434                 capacity = MAX_DATA_LENGTH;
435             } else {
436                 // Should never occur.
437                 // Either MAX_DATA_LENGTH is incorrect,
438                 // or the code writes more values than should be possible.
439                 throw new AssertionError();
440             }
441             int[] newData = new int[capacity];
442             for (int j = 0; j < dataLength; ++j) { newData[j] = data[j]; }
443             data = newData;
444         }
445         dataLength = newTop;
446         return newBlock;
447     }
448 
449     /**
450      * No error checking for illegal arguments.
451      * The Java version always returns non-negative values.
452      */
getDataBlock(int i)453     private int getDataBlock(int i) {
454         if (flags[i] == MIXED) {
455             return index[i];
456         }
457         if (i < BMP_I_LIMIT) {
458             int newBlock = allocDataBlock(CodePointTrie.FAST_DATA_BLOCK_LENGTH);
459             int iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
460             int iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
461             do {
462                 assert(flags[iStart] == ALL_SAME);
463                 writeBlock(newBlock, index[iStart]);
464                 flags[iStart] = MIXED;
465                 index[iStart++] = newBlock;
466                 newBlock += CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
467             } while (iStart < iLimit);
468             return index[i];
469         } else {
470             int newBlock = allocDataBlock(CodePointTrie.SMALL_DATA_BLOCK_LENGTH);
471             if (newBlock < 0) { return newBlock; }
472             writeBlock(newBlock, index[i]);
473             flags[i] = MIXED;
474             index[i] = newBlock;
475             return newBlock;
476         }
477     }
478 
479     // compaction --------------------------------------------------------------
480 
maskValues(int mask)481     private void maskValues(int mask) {
482         initialValue &= mask;
483         errorValue &= mask;
484         highValue &= mask;
485         int iLimit = highStart >> CodePointTrie.SHIFT_3;
486         for (int i = 0; i < iLimit; ++i) {
487             if (flags[i] == ALL_SAME) {
488                 index[i] &= mask;
489             }
490         }
491         for (int i = 0; i < dataLength; ++i) {
492             data[i] &= mask;
493         }
494     }
495 
equalBlocks(int[] s, int si, int[] t, int ti, int length)496     private static boolean equalBlocks(int[] s, int si, int[] t, int ti, int length) {
497         while (length > 0 && s[si] == t[ti]) {
498             ++si;
499             ++ti;
500             --length;
501         }
502         return length == 0;
503     }
504 
equalBlocks(char[] s, int si, int[] t, int ti, int length)505     private static boolean equalBlocks(char[] s, int si, int[] t, int ti, int length) {
506         while (length > 0 && s[si] == t[ti]) {
507             ++si;
508             ++ti;
509             --length;
510         }
511         return length == 0;
512     }
513 
equalBlocks(char[] s, int si, char[] t, int ti, int length)514     private static boolean equalBlocks(char[] s, int si, char[] t, int ti, int length) {
515         while (length > 0 && s[si] == t[ti]) {
516             ++si;
517             ++ti;
518             --length;
519         }
520         return length == 0;
521     }
522 
allValuesSameAs(int[] p, int pi, int length, int value)523     private static boolean allValuesSameAs(int[] p, int pi, int length, int value) {
524         int pLimit = pi + length;
525         while (pi < pLimit && p[pi] == value) { ++pi; }
526         return pi == pLimit;
527     }
528 
529     /** Search for an identical block. */
findSameBlock(char[] p, int pStart, int length, char[] q, int qStart, int blockLength)530     private static int findSameBlock(char[] p, int pStart, int length,
531             char[] q, int qStart, int blockLength) {
532         // Ensure that we do not even partially get past length.
533         length -= blockLength;
534 
535         while (pStart <= length) {
536             if (equalBlocks(p, pStart, q, qStart, blockLength)) {
537                 return pStart;
538             }
539             ++pStart;
540         }
541         return -1;
542     }
543 
findAllSameBlock(int[] p, int start, int limit, int value, int blockLength)544     private static int findAllSameBlock(int[] p, int start, int limit,
545             int value, int blockLength) {
546         // Ensure that we do not even partially get past limit.
547         limit -= blockLength;
548 
549         for (int block = start; block <= limit; ++block) {
550             if (p[block] == value) {
551                 for (int i = 1;; ++i) {
552                     if (i == blockLength) {
553                         return block;
554                     }
555                     if (p[block + i] != value) {
556                         block += i;
557                         break;
558                     }
559                 }
560             }
561         }
562         return -1;
563     }
564 
565     /**
566      * Look for maximum overlap of the beginning of the other block
567      * with the previous, adjacent block.
568      */
getOverlap(int[] p, int length, int[] q, int qStart, int blockLength)569     private static int getOverlap(int[] p, int length, int[] q, int qStart, int blockLength) {
570         int overlap = blockLength - 1;
571         assert(overlap <= length);
572         while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) {
573             --overlap;
574         }
575         return overlap;
576     }
577 
getOverlap(char[] p, int length, int[] q, int qStart, int blockLength)578     private static int getOverlap(char[] p, int length, int[] q, int qStart, int blockLength) {
579         int overlap = blockLength - 1;
580         assert(overlap <= length);
581         while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) {
582             --overlap;
583         }
584         return overlap;
585     }
586 
getOverlap(char[] p, int length, char[] q, int qStart, int blockLength)587     private static int getOverlap(char[] p, int length, char[] q, int qStart, int blockLength) {
588         int overlap = blockLength - 1;
589         assert(overlap <= length);
590         while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) {
591             --overlap;
592         }
593         return overlap;
594     }
595 
getAllSameOverlap(int[] p, int length, int value, int blockLength)596     private static int getAllSameOverlap(int[] p, int length, int value, int blockLength) {
597         int min = length - (blockLength - 1);
598         int i = length;
599         while (min < i && p[i - 1] == value) { --i; }
600         return length - i;
601     }
602 
isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit)603     private static boolean isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit) {
604         for (int i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
605             if (index[i] == dataOffset) {
606                 return true;
607             }
608         }
609         return false;
610     }
611 
612     /**
613      * Finds the start of the last range in the trie by enumerating backward.
614      * Indexes for code points higher than this will be omitted.
615      */
findHighStart()616     private int findHighStart() {
617         int i = highStart >> CodePointTrie.SHIFT_3;
618         while (i > 0) {
619             boolean match;
620             if (flags[--i] == ALL_SAME) {
621                 match = index[i] == highValue;
622             } else /* MIXED */ {
623                 int p = index[i];
624                 for (int j = 0;; ++j) {
625                     if (j == CodePointTrie.SMALL_DATA_BLOCK_LENGTH) {
626                         match = true;
627                         break;
628                     }
629                     if (data[p + j] != highValue) {
630                         match = false;
631                         break;
632                     }
633                 }
634             }
635             if (!match) {
636                 return (i + 1) << CodePointTrie.SHIFT_3;
637             }
638         }
639         return 0;
640     }
641 
642     private static final class AllSameBlocks {
643         static final int NEW_UNIQUE = -1;
644         static final int OVERFLOW = -2;
645 
AllSameBlocks()646         AllSameBlocks() {
647             mostRecent = -1;
648         }
649 
findOrAdd(int index, int count, int value)650         int findOrAdd(int index, int count, int value) {
651             if (mostRecent >= 0 && values[mostRecent] == value) {
652                 refCounts[mostRecent] += count;
653                 return indexes[mostRecent];
654             }
655             for (int i = 0; i < length; ++i) {
656                 if (values[i] == value) {
657                     mostRecent = i;
658                     refCounts[i] += count;
659                     return indexes[i];
660                 }
661             }
662             if (length == CAPACITY) {
663                 return OVERFLOW;
664             }
665             mostRecent = length;
666             indexes[length] = index;
667             values[length] = value;
668             refCounts[length++] = count;
669             return NEW_UNIQUE;
670         }
671 
672         /** Replaces the block which has the lowest reference count. */
add(int index, int count, int value)673         void add(int index, int count, int value) {
674             assert(length == CAPACITY);
675             int least = -1;
676             int leastCount = I_LIMIT;
677             for (int i = 0; i < length; ++i) {
678                 assert(values[i] != value);
679                 if (refCounts[i] < leastCount) {
680                     least = i;
681                     leastCount = refCounts[i];
682                 }
683             }
684             assert(least >= 0);
685             mostRecent = least;
686             indexes[least] = index;
687             values[least] = value;
688             refCounts[least] = count;
689         }
690 
findMostUsed()691         int findMostUsed() {
692             if (length == 0) { return -1; }
693             int max = -1;
694             int maxCount = 0;
695             for (int i = 0; i < length; ++i) {
696                 if (refCounts[i] > maxCount) {
697                     max = i;
698                     maxCount = refCounts[i];
699                 }
700             }
701             return indexes[max];
702         }
703 
704         private static final int CAPACITY = 32;
705 
706         private int length;
707         private int mostRecent;
708 
709         private int[] indexes = new int[CAPACITY];
710         private int[] values = new int[CAPACITY];
711         private int[] refCounts = new int[CAPACITY];
712     }
713 
714     // Custom hash table for mixed-value blocks to be found anywhere in the
715     // compacted data or index so far.
716     private static final class MixedBlocks {
init(int maxLength, int newBlockLength)717         void init(int maxLength, int newBlockLength) {
718             // We store actual data indexes + 1 to reserve 0 for empty entries.
719             int maxDataIndex = maxLength - newBlockLength + 1;
720             int newLength;
721             if (maxDataIndex <= 0xfff) {  // 4k
722                 newLength = 6007;
723                 shift = 12;
724                 mask = 0xfff;
725             } else if (maxDataIndex <= 0x7fff) {  // 32k
726                 newLength = 50021;
727                 shift = 15;
728                 mask = 0x7fff;
729             } else if (maxDataIndex <= 0x1ffff) {  // 128k
730                 newLength = 200003;
731                 shift = 17;
732                 mask = 0x1ffff;
733             } else {
734                 // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
735                 newLength = 1500007;
736                 shift = 21;
737                 mask = 0x1fffff;
738             }
739             if (table == null || newLength > table.length) {
740                 table = new int[newLength];
741             } else {
742                 Arrays.fill(table, 0, newLength, 0);
743             }
744             length = newLength;
745 
746             blockLength = newBlockLength;
747         }
748 
extend(int[] data, int minStart, int prevDataLength, int newDataLength)749         void extend(int[] data, int minStart, int prevDataLength, int newDataLength) {
750             int start = prevDataLength - blockLength;
751             if (start >= minStart) {
752                 ++start;  // Skip the last block that we added last time.
753             } else {
754                 start = minStart;  // Begin with the first full block.
755             }
756             for (int end = newDataLength - blockLength; start <= end; ++start) {
757                 int hashCode = makeHashCode(data, start);
758                 addEntry(data, null, start, hashCode, start);
759             }
760         }
761 
extend(char[] data, int minStart, int prevDataLength, int newDataLength)762         void extend(char[] data, int minStart, int prevDataLength, int newDataLength) {
763             int start = prevDataLength - blockLength;
764             if (start >= minStart) {
765                 ++start;  // Skip the last block that we added last time.
766             } else {
767                 start = minStart;  // Begin with the first full block.
768             }
769             for (int end = newDataLength - blockLength; start <= end; ++start) {
770                 int hashCode = makeHashCode(data, start);
771                 addEntry(null, data, start, hashCode, start);
772             }
773         }
774 
findBlock(int[] data, int[] blockData, int blockStart)775         int findBlock(int[] data, int[] blockData, int blockStart) {
776             int hashCode = makeHashCode(blockData, blockStart);
777             int entryIndex = findEntry(data, null, blockData, null, blockStart, hashCode);
778             if (entryIndex >= 0) {
779                 return (table[entryIndex] & mask) - 1;
780             } else {
781                 return -1;
782             }
783         }
784 
findBlock(char[] data, int[] blockData, int blockStart)785         int findBlock(char[] data, int[] blockData, int blockStart) {
786             int hashCode = makeHashCode(blockData, blockStart);
787             int entryIndex = findEntry(null, data, blockData, null, blockStart, hashCode);
788             if (entryIndex >= 0) {
789                 return (table[entryIndex] & mask) - 1;
790             } else {
791                 return -1;
792             }
793         }
794 
findBlock(char[] data, char[] blockData, int blockStart)795         int findBlock(char[] data, char[] blockData, int blockStart) {
796             int hashCode = makeHashCode(blockData, blockStart);
797             int entryIndex = findEntry(null, data, null, blockData, blockStart, hashCode);
798             if (entryIndex >= 0) {
799                 return (table[entryIndex] & mask) - 1;
800             } else {
801                 return -1;
802             }
803         }
804 
findAllSameBlock(int[] data, int blockValue)805         int findAllSameBlock(int[] data, int blockValue) {
806             int hashCode = makeHashCode(blockValue);
807             int entryIndex = findEntry(data, blockValue, hashCode);
808             if (entryIndex >= 0) {
809                 return (table[entryIndex] & mask) - 1;
810             } else {
811                 return -1;
812             }
813         }
814 
makeHashCode(int[] blockData, int blockStart)815         private int makeHashCode(int[] blockData, int blockStart) {
816             int blockLimit = blockStart + blockLength;
817             int hashCode = blockData[blockStart++];
818             do {
819                 hashCode = 37 * hashCode + blockData[blockStart++];
820             } while (blockStart < blockLimit);
821             return hashCode;
822         }
823 
makeHashCode(char[] blockData, int blockStart)824         private int makeHashCode(char[] blockData, int blockStart) {
825             int blockLimit = blockStart + blockLength;
826             int hashCode = blockData[blockStart++];
827             do {
828                 hashCode = 37 * hashCode + blockData[blockStart++];
829             } while (blockStart < blockLimit);
830             return hashCode;
831         }
832 
makeHashCode(int blockValue)833         private int makeHashCode(int blockValue) {
834             int hashCode = blockValue;
835             for (int i = 1; i < blockLength; ++i) {
836                 hashCode = 37 * hashCode + blockValue;
837             }
838             return hashCode;
839         }
840 
addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex)841         private void addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex) {
842             assert(0 <= dataIndex && dataIndex < mask);
843             int entryIndex = findEntry(data32, data16, data32, data16, blockStart, hashCode);
844             if (entryIndex < 0) {
845                 table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
846             }
847         }
848 
findEntry(int[] data32, char[] data16, int[] blockData32, char[] blockData16, int blockStart, int hashCode)849         private int findEntry(int[] data32, char[] data16,
850                 int[] blockData32, char[] blockData16, int blockStart, int hashCode) {
851             int shiftedHashCode = hashCode << shift;
852             int initialEntryIndex = modulo(hashCode, length - 1) + 1;  // 1..length-1
853             for (int entryIndex = initialEntryIndex;;) {
854                 int entry = table[entryIndex];
855                 if (entry == 0) {
856                     return ~entryIndex;
857                 }
858                 if ((entry & ~mask) == shiftedHashCode) {
859                     int dataIndex = (entry & mask) - 1;
860                     if (data32 != null ?
861                             equalBlocks(data32, dataIndex, blockData32, blockStart, blockLength) :
862                             blockData32 != null ?
863                                 equalBlocks(data16, dataIndex, blockData32, blockStart, blockLength) :
864                                 equalBlocks(data16, dataIndex, blockData16, blockStart, blockLength)) {
865                         return entryIndex;
866                     }
867                 }
868                 entryIndex = nextIndex(initialEntryIndex, entryIndex);
869             }
870         }
871 
findEntry(int[] data, int blockValue, int hashCode)872         private int findEntry(int[] data, int blockValue, int hashCode) {
873             int shiftedHashCode = hashCode << shift;
874             int initialEntryIndex = modulo(hashCode, length - 1) + 1;  // 1..length-1
875             for (int entryIndex = initialEntryIndex;;) {
876                 int entry = table[entryIndex];
877                 if (entry == 0) {
878                     return ~entryIndex;
879                 }
880                 if ((entry & ~mask) == shiftedHashCode) {
881                     int dataIndex = (entry & mask) - 1;
882                     if (allValuesSameAs(data, dataIndex, blockLength, blockValue)) {
883                         return entryIndex;
884                     }
885                 }
886                 entryIndex = nextIndex(initialEntryIndex, entryIndex);
887             }
888         }
889 
nextIndex(int initialEntryIndex, int entryIndex)890         private int nextIndex(int initialEntryIndex, int entryIndex) {
891             // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
892             return (entryIndex + initialEntryIndex) % length;
893         }
894 
895         /** Ensures non-negative n % m (that is 0..m-1). */
modulo(int n, int m)896         private int modulo(int n, int m) {
897             int i = n % m;
898             if (i < 0) {
899                 i += m;
900             }
901             return i;
902         }
903 
904         // Hash table.
905         // The length is a prime number, larger than the maximum data length.
906         // The "shift" lower bits store a data index + 1.
907         // The remaining upper bits store a partial hashCode of the block data values.
908         private int[] table;
909         private int length;
910         private int shift;
911         private int mask;
912 
913         private int blockLength;
914     }
915 
compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks)916     private int compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks) {
917         // ASCII data will be stored as a linear table, even if the following code
918         // does not yet count it that way.
919         int newDataCapacity = ASCII_LIMIT;
920         // Add room for a small data null block in case it would match the start of
921         // a fast data block where dataNullOffset must not be set in that case.
922         newDataCapacity += CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
923         // Add room for special values (errorValue, highValue) and padding.
924         newDataCapacity += 4;
925         int iLimit = highStart >> CodePointTrie.SHIFT_3;
926         int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH;
927         int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
928         for (int i = 0; i < iLimit; i += inc) {
929             if (i == fastILimit) {
930                 blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
931                 inc = 1;
932             }
933             int value = index[i];
934             if (flags[i] == MIXED) {
935                 // Really mixed?
936                 int p = value;
937                 value = data[p];
938                 if (allValuesSameAs(data, p + 1, blockLength - 1, value)) {
939                     flags[i] = ALL_SAME;
940                     index[i] = value;
941                     // Fall through to ALL_SAME handling.
942                 } else {
943                     newDataCapacity += blockLength;
944                     continue;
945                 }
946             } else {
947                 assert(flags[i] == ALL_SAME);
948                 if (inc > 1) {
949                     // Do all of the fast-range data block's ALL_SAME parts have the same value?
950                     boolean allSame = true;
951                     int next_i = i + inc;
952                     for (int j = i + 1; j < next_i; ++j) {
953                         assert(flags[j] == ALL_SAME);
954                         if (index[j] != value) {
955                             allSame = false;
956                             break;
957                         }
958                     }
959                     if (!allSame) {
960                         // Turn it into a MIXED block.
961                         if (getDataBlock(i) < 0) {
962                             return -1;
963                         }
964                         newDataCapacity += blockLength;
965                         continue;
966                     }
967                 }
968             }
969             // Is there another ALL_SAME block with the same value?
970             int other = allSameBlocks.findOrAdd(i, inc, value);
971             if (other == AllSameBlocks.OVERFLOW) {
972                 // The fixed-size array overflowed. Slow check for a duplicate block.
973                 int jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
974                 for (int j = 0;; j += jInc) {
975                     if (j == i) {
976                         allSameBlocks.add(i, inc, value);
977                         break;
978                     }
979                     if (j == fastILimit) {
980                         jInc = 1;
981                     }
982                     if (flags[j] == ALL_SAME && index[j] == value) {
983                         allSameBlocks.add(j, jInc + inc, value);
984                         other = j;
985                         break;
986                         // We could keep counting blocks with the same value
987                         // before we add the first one, which may improve compaction in rare cases,
988                         // but it would make it slower.
989                     }
990                 }
991             }
992             if (other >= 0) {
993                 flags[i] = SAME_AS;
994                 index[i] = other;
995             } else {
996                 // New unique same-value block.
997                 newDataCapacity += blockLength;
998             }
999         }
1000         return newDataCapacity;
1001     }
1002 
1003     /**
1004      * Compacts a build-time trie.
1005      *
1006      * The compaction
1007      * - removes blocks that are identical with earlier ones
1008      * - overlaps each new non-duplicate block as much as possible with the previously-written one
1009      * - works with fast-range data blocks whose length is a multiple of that of
1010      *   higher-code-point data blocks
1011      *
1012      * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
1013      */
compactData( int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks)1014     private int compactData(
1015             int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks) {
1016         // The linear ASCII data has been copied into newData already.
1017         int newDataLength = 0;
1018         for (int i = 0; newDataLength < ASCII_LIMIT;
1019                 newDataLength += CodePointTrie.FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
1020             index[i] = newDataLength;
1021         }
1022 
1023         int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH;
1024         mixedBlocks.init(newData.length, blockLength);
1025         mixedBlocks.extend(newData, 0, 0, newDataLength);
1026 
1027         int iLimit = highStart >> CodePointTrie.SHIFT_3;
1028         int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
1029         int fastLength = 0;
1030         for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) {
1031             if (i == fastILimit) {
1032                 blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
1033                 inc = 1;
1034                 fastLength = newDataLength;
1035                 mixedBlocks.init(newData.length, blockLength);
1036                 mixedBlocks.extend(newData, 0, 0, newDataLength);
1037             }
1038             if (flags[i] == ALL_SAME) {
1039                 int value = index[i];
1040                 // Find an earlier part of the data array of length blockLength
1041                 // that is filled with this value.
1042                 int n = mixedBlocks.findAllSameBlock(newData, value);
1043                 // If we find a match, and the current block is the data null block,
1044                 // and it is not a fast block but matches the start of a fast block,
1045                 // then we need to continue looking.
1046                 // This is because this small block is shorter than the fast block,
1047                 // and not all of the rest of the fast block is filled with this value.
1048                 // Otherwise trie.getRange() would detect that the fast block starts at
1049                 // dataNullOffset and assume incorrectly that it is filled with the null value.
1050                 while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
1051                         isStartOfSomeFastBlock(n, index, fastILimit)) {
1052                     n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
1053                 }
1054                 if (n >= 0) {
1055                     index[i] = n;
1056                 } else {
1057                     n = getAllSameOverlap(newData, newDataLength, value, blockLength);
1058                     index[i] = newDataLength - n;
1059                     int prevDataLength = newDataLength;
1060                     while (n < blockLength) {
1061                         newData[newDataLength++] = value;
1062                         ++n;
1063                     }
1064                     mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
1065                 }
1066             } else if (flags[i] == MIXED) {
1067                 int block = index[i];
1068                 int n = mixedBlocks.findBlock(newData, data, block);
1069                 if (n >= 0) {
1070                     index[i] = n;
1071                 } else {
1072                     n = getOverlap(newData, newDataLength, data, block, blockLength);
1073                     index[i] = newDataLength - n;
1074                     int prevDataLength = newDataLength;
1075                     while (n < blockLength) {
1076                         newData[newDataLength++] = data[block + n++];
1077                     }
1078                     mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
1079                 }
1080             } else /* SAME_AS */ {
1081                 int j = index[i];
1082                 index[i] = index[j];
1083             }
1084         }
1085 
1086         return newDataLength;
1087     }
1088 
compactIndex(int fastILimit, MixedBlocks mixedBlocks)1089     private int compactIndex(int fastILimit, MixedBlocks mixedBlocks) {
1090         int fastIndexLength = fastILimit >> (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3);
1091         if ((highStart >> CodePointTrie.FAST_SHIFT) <= fastIndexLength) {
1092             // Only the linear fast index, no multi-stage index tables.
1093             index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET;
1094             return fastIndexLength;
1095         }
1096 
1097         // Condense the fast index table.
1098         // Also, does it contain an index-3 block with all dataNullOffset?
1099         char[] fastIndex = new char[fastIndexLength];
1100         int i3FirstNull = -1;
1101         for (int i = 0, j = 0; i < fastILimit; ++j) {
1102             int i3 = index[i];
1103             fastIndex[j] = (char)i3;
1104             if (i3 == dataNullOffset) {
1105                 if (i3FirstNull < 0) {
1106                     i3FirstNull = j;
1107                 } else if (index3NullOffset < 0 &&
1108                         (j - i3FirstNull + 1) == CodePointTrie.INDEX_3_BLOCK_LENGTH) {
1109                     index3NullOffset = i3FirstNull;
1110                 }
1111             } else {
1112                 i3FirstNull = -1;
1113             }
1114             // Set the index entries that compactData() skipped.
1115             // Needed when the multi-stage index covers the fast index range as well.
1116             int iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
1117             while (++i < iNext) {
1118                 i3 += CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
1119                 index[i] = i3;
1120             }
1121         }
1122 
1123         mixedBlocks.init(fastIndexLength, CodePointTrie.INDEX_3_BLOCK_LENGTH);
1124         mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
1125 
1126         // Examine index-3 blocks. For each determine one of:
1127         // - same as the index-3 null block
1128         // - same as a fast-index block
1129         // - 16-bit indexes
1130         // - 18-bit indexes
1131         // We store this in the first flags entry for the index-3 block.
1132         //
1133         // Also determine an upper limit for the index-3 table length.
1134         int index3Capacity = 0;
1135         i3FirstNull = index3NullOffset;
1136         boolean hasLongI3Blocks = false;
1137         // If the fast index covers the whole BMP, then
1138         // the multi-stage index is only for supplementary code points.
1139         // Otherwise, the multi-stage index covers all of Unicode.
1140         int iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
1141         int iLimit = highStart >> CodePointTrie.SHIFT_3;
1142         for (int i = iStart; i < iLimit;) {
1143             int j = i;
1144             int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH;
1145             int oredI3 = 0;
1146             boolean isNull = true;
1147             do {
1148                 int i3 = index[j];
1149                 oredI3 |= i3;
1150                 if (i3 != dataNullOffset) {
1151                     isNull = false;
1152                 }
1153             } while (++j < jLimit);
1154             if (isNull) {
1155                 flags[i] = I3_NULL;
1156                 if (i3FirstNull < 0) {
1157                     if (oredI3 <= 0xffff) {
1158                         index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH;
1159                     } else {
1160                         index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
1161                         hasLongI3Blocks = true;
1162                     }
1163                     i3FirstNull = 0;
1164                 }
1165             } else {
1166                 if (oredI3 <= 0xffff) {
1167                     int n = mixedBlocks.findBlock(fastIndex, index, i);
1168                     if (n >= 0) {
1169                         flags[i] = I3_BMP;
1170                         index[i] = n;
1171                     } else {
1172                         flags[i] = I3_16;
1173                         index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH;
1174                     }
1175                 } else {
1176                     flags[i] = I3_18;
1177                     index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
1178                     hasLongI3Blocks = true;
1179                 }
1180             }
1181             i = j;
1182         }
1183 
1184         int index2Capacity = (iLimit - iStart) >> CodePointTrie.SHIFT_2_3;
1185 
1186         // Length of the index-1 table, rounded up.
1187         int index1Length = (index2Capacity + CodePointTrie.INDEX_2_MASK) >> CodePointTrie.SHIFT_1_2;
1188 
1189         // Index table: Fast index, index-1, index-3, index-2.
1190         // +1 for possible index table padding.
1191         int index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
1192         index16 = Arrays.copyOf(fastIndex, index16Capacity);
1193 
1194         mixedBlocks.init(index16Capacity, CodePointTrie.INDEX_3_BLOCK_LENGTH);
1195         MixedBlocks longI3Blocks = null;
1196         if (hasLongI3Blocks) {
1197             longI3Blocks = new MixedBlocks();
1198             longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH);
1199         }
1200 
1201         // Compact the index-3 table and write an uncompacted version of the index-2 table.
1202         char[] index2 = new char[index2Capacity];
1203         int i2Length = 0;
1204         i3FirstNull = index3NullOffset;
1205         int index3Start = fastIndexLength + index1Length;
1206         int indexLength = index3Start;
1207         for (int i = iStart; i < iLimit; i += CodePointTrie.INDEX_3_BLOCK_LENGTH) {
1208             int i3;
1209             byte f = flags[i];
1210             if (f == I3_NULL && i3FirstNull < 0) {
1211                 // First index-3 null block. Write & overlap it like a normal block, then remember it.
1212                 f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
1213                 i3FirstNull = 0;
1214             }
1215             if (f == I3_NULL) {
1216                 i3 = index3NullOffset;
1217             } else if (f == I3_BMP) {
1218                 i3 = index[i];
1219             } else if (f == I3_16) {
1220                 int n = mixedBlocks.findBlock(index16, index, i);
1221                 if (n >= 0) {
1222                     i3 = n;
1223                 } else {
1224                     if (indexLength == index3Start) {
1225                         // No overlap at the boundary between the index-1 and index-3 tables.
1226                         n = 0;
1227                     } else {
1228                         n = getOverlap(index16, indexLength,
1229                                        index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH);
1230                     }
1231                     i3 = indexLength - n;
1232                     int prevIndexLength = indexLength;
1233                     while (n < CodePointTrie.INDEX_3_BLOCK_LENGTH) {
1234                         index16[indexLength++] = (char)index[i + n++];
1235                     }
1236                     mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1237                     if (hasLongI3Blocks) {
1238                         longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
1239                     }
1240                 }
1241             } else {
1242                 assert(f == I3_18);
1243                 assert(hasLongI3Blocks);
1244                 // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
1245                 int j = i;
1246                 int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH;
1247                 int k = indexLength;
1248                 do {
1249                     ++k;
1250                     int v = index[j++];
1251                     int upperBits = (v & 0x30000) >> 2;
1252                     index16[k++] = (char)v;
1253                     v = index[j++];
1254                     upperBits |= (v & 0x30000) >> 4;
1255                     index16[k++] = (char)v;
1256                     v = index[j++];
1257                     upperBits |= (v & 0x30000) >> 6;
1258                     index16[k++] = (char)v;
1259                     v = index[j++];
1260                     upperBits |= (v & 0x30000) >> 8;
1261                     index16[k++] = (char)v;
1262                     v = index[j++];
1263                     upperBits |= (v & 0x30000) >> 10;
1264                     index16[k++] = (char)v;
1265                     v = index[j++];
1266                     upperBits |= (v & 0x30000) >> 12;
1267                     index16[k++] = (char)v;
1268                     v = index[j++];
1269                     upperBits |= (v & 0x30000) >> 14;
1270                     index16[k++] = (char)v;
1271                     v = index[j++];
1272                     upperBits |= (v & 0x30000) >> 16;
1273                     index16[k++] = (char)v;
1274                     index16[k - 9] = (char)upperBits;
1275                 } while (j < jLimit);
1276                 int n = longI3Blocks.findBlock(index16, index16, indexLength);
1277                 if (n >= 0) {
1278                     i3 = n | 0x8000;
1279                 } else {
1280                     if (indexLength == index3Start) {
1281                         // No overlap at the boundary between the index-1 and index-3 tables.
1282                         n = 0;
1283                     } else {
1284                         n = getOverlap(index16, indexLength,
1285                                        index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
1286                     }
1287                     i3 = (indexLength - n) | 0x8000;
1288                     int prevIndexLength = indexLength;
1289                     if (n > 0) {
1290                         int start = indexLength;
1291                         while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
1292                             index16[indexLength++] = index16[start + n++];
1293                         }
1294                     } else {
1295                         indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
1296                     }
1297                     mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1298                     if (hasLongI3Blocks) {
1299                         longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
1300                     }
1301                 }
1302             }
1303             if (index3NullOffset < 0 && i3FirstNull >= 0) {
1304                 index3NullOffset = i3;
1305             }
1306             // Set the index-2 table entry.
1307             index2[i2Length++] = (char)i3;
1308         }
1309         assert(i2Length == index2Capacity);
1310         assert(indexLength <= index3Start + index3Capacity);
1311 
1312         if (index3NullOffset < 0) {
1313             index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET;
1314         }
1315         if (indexLength >= (CodePointTrie.NO_INDEX3_NULL_OFFSET + CodePointTrie.INDEX_3_BLOCK_LENGTH)) {
1316             // The index-3 offsets exceed 15 bits, or
1317             // the last one cannot be distinguished from the no-null-block value.
1318             throw new IndexOutOfBoundsException(
1319                     "The trie data exceeds limitations of the data structure.");
1320         }
1321 
1322         // Compact the index-2 table and write the index-1 table.
1323         // assert(CodePointTrie.INDEX_2_BLOCK_LENGTH == CodePointTrie.INDEX_3_BLOCK_LENGTH) :
1324         //     "must re-init mixedBlocks";
1325         int blockLength = CodePointTrie.INDEX_2_BLOCK_LENGTH;
1326         int i1 = fastIndexLength;
1327         for (int i = 0; i < i2Length; i += blockLength) {
1328             int n;
1329             if ((i2Length - i) >= blockLength) {
1330                 // normal block
1331                 assert(blockLength == CodePointTrie.INDEX_2_BLOCK_LENGTH);
1332                 n = mixedBlocks.findBlock(index16, index2, i);
1333             } else {
1334                 // highStart is inside the last index-2 block. Shorten it.
1335                 blockLength = i2Length - i;
1336                 n = findSameBlock(index16, index3Start, indexLength,
1337                         index2, i, blockLength);
1338             }
1339             int i2;
1340             if (n >= 0) {
1341                 i2 = n;
1342             } else {
1343                 if (indexLength == index3Start) {
1344                     // No overlap at the boundary between the index-1 and index-3/2 tables.
1345                     n = 0;
1346                 } else {
1347                     n = getOverlap(index16, indexLength, index2, i, blockLength);
1348                 }
1349                 i2 = indexLength - n;
1350                 int prevIndexLength = indexLength;
1351                 while (n < blockLength) {
1352                     index16[indexLength++] = index2[i + n++];
1353                 }
1354                 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
1355             }
1356             // Set the index-1 table entry.
1357             index16[i1++] = (char)i2;
1358         }
1359         assert(i1 == index3Start);
1360         assert(indexLength <= index16Capacity);
1361 
1362         return indexLength;
1363     }
1364 
compactTrie(int fastILimit)1365     private int compactTrie(int fastILimit) {
1366         // Find the real highStart and round it up.
1367         assert((highStart & (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) == 0);
1368         highValue = get(MAX_UNICODE);
1369         int realHighStart = findHighStart();
1370         realHighStart = (realHighStart + (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) &
1371             ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1);
1372         if (realHighStart == UNICODE_LIMIT) {
1373             highValue = initialValue;
1374         }
1375 
1376         // We always store indexes and data values for the fast range.
1377         // Pin highStart to the top of that range while building.
1378         int fastLimit = fastILimit << CodePointTrie.SHIFT_3;
1379         if (realHighStart < fastLimit) {
1380             for (int i = (realHighStart >> CodePointTrie.SHIFT_3); i < fastILimit; ++i) {
1381                 flags[i] = ALL_SAME;
1382                 index[i] = highValue;
1383             }
1384             highStart = fastLimit;
1385         } else {
1386             highStart = realHighStart;
1387         }
1388 
1389         int[] asciiData = new int[ASCII_LIMIT];
1390         for (int i = 0; i < ASCII_LIMIT; ++i) {
1391             asciiData[i] = get(i);
1392         }
1393 
1394         // First we look for which data blocks have the same value repeated over the whole block,
1395         // deduplicate such blocks, find a good null data block (for faster enumeration),
1396         // and get an upper bound for the necessary data array length.
1397         AllSameBlocks allSameBlocks = new AllSameBlocks();
1398         int newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
1399         int[] newData = Arrays.copyOf(asciiData, newDataCapacity);
1400 
1401         int dataNullIndex = allSameBlocks.findMostUsed();
1402 
1403         MixedBlocks mixedBlocks = new MixedBlocks();
1404         int newDataLength = compactData(fastILimit, newData, dataNullIndex, mixedBlocks);
1405         assert(newDataLength <= newDataCapacity);
1406         data = newData;
1407         dataLength = newDataLength;
1408         if (dataLength > (0x3ffff + CodePointTrie.SMALL_DATA_BLOCK_LENGTH)) {
1409             // The offset of the last data block is too high to be stored in the index table.
1410             throw new IndexOutOfBoundsException(
1411                     "The trie data exceeds limitations of the data structure.");
1412         }
1413 
1414         if (dataNullIndex >= 0) {
1415             dataNullOffset = index[dataNullIndex];
1416             initialValue = data[dataNullOffset];
1417         } else {
1418             dataNullOffset = CodePointTrie.NO_DATA_NULL_OFFSET;
1419         }
1420 
1421         int indexLength = compactIndex(fastILimit, mixedBlocks);
1422         highStart = realHighStart;
1423         return indexLength;
1424     }
1425 
build(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth)1426     private CodePointTrie build(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) {
1427         // The mutable trie always stores 32-bit values.
1428         // When we build a UCPTrie for a smaller value width, we first mask off unused bits
1429         // before compacting the data.
1430         switch (valueWidth) {
1431         case BITS_32:
1432             break;
1433         case BITS_16:
1434             maskValues(0xffff);
1435             break;
1436         case BITS_8:
1437             maskValues(0xff);
1438             break;
1439         default:
1440             // Should be unreachable.
1441             throw new IllegalArgumentException();
1442         }
1443 
1444         int fastLimit = type == CodePointTrie.Type.FAST ? BMP_LIMIT : CodePointTrie.SMALL_LIMIT;
1445         int indexLength = compactTrie(fastLimit >> CodePointTrie.SHIFT_3);
1446 
1447         // Ensure data table alignment: The index length must be even for uint32_t data.
1448         if (valueWidth == CodePointTrie.ValueWidth.BITS_32 && (indexLength & 1) != 0) {
1449             index16[indexLength++] = 0xffee;  // arbitrary value
1450         }
1451 
1452         // Make the total trie structure length a multiple of 4 bytes by padding the data table,
1453         // and store special values as the last two data values.
1454         int length = indexLength * 2;
1455         if (valueWidth == CodePointTrie.ValueWidth.BITS_16) {
1456             if (((indexLength ^ dataLength) & 1) != 0) {
1457                 // padding
1458                 data[dataLength++] = errorValue;
1459             }
1460             if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
1461                 data[dataLength++] = highValue;
1462                 data[dataLength++] = errorValue;
1463             }
1464             length += dataLength * 2;
1465         } else if (valueWidth == CodePointTrie.ValueWidth.BITS_32) {
1466             // 32-bit data words never need padding to a multiple of 4 bytes.
1467             if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
1468                 if (data[dataLength - 1] != highValue) {
1469                     data[dataLength++] = highValue;
1470                 }
1471                 data[dataLength++] = errorValue;
1472             }
1473             length += dataLength * 4;
1474         } else {
1475             int and3 = (length + dataLength) & 3;
1476             if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
1477                 // all set
1478             } else if(and3 == 3 && data[dataLength - 1] == highValue) {
1479                 data[dataLength++] = errorValue;
1480             } else {
1481                 while (and3 != 2) {
1482                     data[dataLength++] = highValue;
1483                     and3 = (and3 + 1) & 3;
1484                 }
1485                 data[dataLength++] = highValue;
1486                 data[dataLength++] = errorValue;
1487             }
1488             length += dataLength;
1489         }
1490         assert((length & 3) == 0);
1491 
1492         // Fill the index and data arrays.
1493         char[] trieIndex;
1494         if (highStart <= fastLimit) {
1495             // Condense only the fast index from the mutable-trie index.
1496             trieIndex = new char[indexLength];
1497             for (int i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
1498                 trieIndex[j] = (char)index[i];
1499             }
1500         } else {
1501             if (indexLength == index16.length) {
1502                 trieIndex = index16;
1503                 index16 = null;
1504             } else {
1505                 trieIndex = Arrays.copyOf(index16, indexLength);
1506             }
1507         }
1508 
1509         // Write the data array.
1510         switch (valueWidth) {
1511         case BITS_16: {
1512             // Write 16-bit data values.
1513             char[] data16 = new char[dataLength];
1514             for (int i = 0; i < dataLength; ++i) { data16[i] = (char)data[i]; }
1515             return type == CodePointTrie.Type.FAST ?
1516                     new CodePointTrie.Fast16(trieIndex, data16, highStart,
1517                             index3NullOffset, dataNullOffset) :
1518                     new CodePointTrie.Small16(trieIndex, data16, highStart,
1519                             index3NullOffset, dataNullOffset);
1520         }
1521         case BITS_32: {
1522             // Write 32-bit data values.
1523             int[] data32 = Arrays.copyOf(data, dataLength);
1524             return type == CodePointTrie.Type.FAST ?
1525                     new CodePointTrie.Fast32(trieIndex, data32, highStart,
1526                             index3NullOffset, dataNullOffset) :
1527                     new CodePointTrie.Small32(trieIndex, data32, highStart,
1528                             index3NullOffset, dataNullOffset);
1529         }
1530         case BITS_8: {
1531             // Write 8-bit data values.
1532             byte[] data8 = new byte[dataLength];
1533             for (int i = 0; i < dataLength; ++i) { data8[i] = (byte)data[i]; }
1534             return type == CodePointTrie.Type.FAST ?
1535                     new CodePointTrie.Fast8(trieIndex, data8, highStart,
1536                             index3NullOffset, dataNullOffset) :
1537                     new CodePointTrie.Small8(trieIndex, data8, highStart,
1538                             index3NullOffset, dataNullOffset);
1539         }
1540         default:
1541             // Should be unreachable.
1542             throw new IllegalArgumentException();
1543         }
1544     }
1545 }
1546