• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2012-2015, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
10 *
11 * C++ version created on: 2012apr01
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.Iterator;
20 
21 import ohos.global.icu.impl.Norm2AllModes;
22 import ohos.global.icu.impl.Normalizer2Impl;
23 import ohos.global.icu.impl.Normalizer2Impl.Hangul;
24 import ohos.global.icu.impl.Trie2;
25 import ohos.global.icu.impl.Trie2Writable;
26 import ohos.global.icu.lang.UCharacter;
27 import ohos.global.icu.text.UnicodeSet;
28 import ohos.global.icu.text.UnicodeSetIterator;
29 import ohos.global.icu.util.CharsTrie;
30 import ohos.global.icu.util.CharsTrieBuilder;
31 import ohos.global.icu.util.StringTrieBuilder;
32 
33 /**
34  * Low-level CollationData builder.
35  * Takes (character, CE) pairs and builds them into runtime data structures.
36  * Supports characters with context prefixes and contraction suffixes.
37  */
38 final class CollationDataBuilder {  // not final in C++
39     /**
40      * Collation element modifier. Interface class for a modifier
41      * that changes a tailoring builder's temporary CEs to final CEs.
42      * Called for every non-special CE32 and every expansion CE.
43      */
44     interface CEModifier {
45         /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
modifyCE32(int ce32)46         long modifyCE32(int ce32);
47         /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
modifyCE(long ce)48         long modifyCE(long ce);
49     }
50 
CollationDataBuilder()51     CollationDataBuilder() {
52         nfcImpl = Norm2AllModes.getNFCInstance().impl;
53         base = null;
54         baseSettings = null;
55         trie = null;
56         ce32s = new UVector32();
57         ce64s = new UVector64();
58         conditionalCE32s = new ArrayList<ConditionalCE32>();
59         modified = false;
60         fastLatinEnabled = false;
61         fastLatinBuilder = null;
62         collIter = null;
63         // Reserve the first CE32 for U+0000.
64         ce32s.addElement(0);
65     }
66 
initForTailoring(CollationData b)67     void initForTailoring(CollationData b) {
68         if(trie != null) {
69             throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
70         }
71         if(b == null) {
72             throw new IllegalArgumentException("null CollationData");
73         }
74         base = b;
75 
76         // For a tailoring, the default is to fall back to the base.
77         trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);
78 
79         // Set the Latin-1 letters block so that it is allocated first in the data array,
80         // to try to improve locality of reference when sorting Latin-1 text.
81         // Do not use utrie2_setRange32() since that will not actually allocate blocks
82         // that are filled with the default value.
83         // ASCII (0..7F) is already preallocated anyway.
84         for(int c = 0xc0; c <= 0xff; ++c) {
85             trie.set(c, Collation.FALLBACK_CE32);
86         }
87 
88         // Hangul syllables are not tailorable (except via tailoring Jamos).
89         // Always set the Hangul tag to help performance.
90         // Do this here, rather than in buildMappings(),
91         // so that we see the HANGUL_TAG in various assertions.
92         int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
93         trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);
94 
95         // Copy the set contents but don't copy/clone the set as a whole because
96         // that would copy the isFrozen state too.
97         unsafeBackwardSet.addAll(b.unsafeBackwardSet);
98     }
99 
isCompressibleLeadByte(int b)100     boolean isCompressibleLeadByte(int b) {
101         return base.isCompressibleLeadByte(b);
102     }
103 
isCompressiblePrimary(long p)104     boolean isCompressiblePrimary(long p) {
105         return isCompressibleLeadByte((int)p >>> 24);
106     }
107 
108     /**
109      * @return true if this builder has mappings (e.g., add() has been called)
110      */
hasMappings()111     boolean hasMappings() { return modified; }
112 
113     /**
114      * @return true if c has CEs in this builder
115      */
isAssigned(int c)116     boolean isAssigned(int c) {
117         return Collation.isAssignedCE32(trie.get(c));
118     }
119 
add(CharSequence prefix, CharSequence s, long ces[], int cesLength)120     void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
121         int ce32 = encodeCEs(ces, cesLength);
122         addCE32(prefix, s, ce32);
123     }
124 
125     /**
126      * Encodes the ces as either the returned ce32 by itself,
127      * or by storing an expansion, with the returned ce32 referring to that.
128      *
129      * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
130      */
encodeCEs(long ces[], int cesLength)131     int encodeCEs(long ces[], int cesLength) {
132         if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) {
133             throw new IllegalArgumentException("mapping to too many CEs");
134         }
135         if(!isMutable()) {
136             throw new IllegalStateException("attempt to add mappings after build()");
137         }
138         if(cesLength == 0) {
139             // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
140             // Do this here so that callers need not do it.
141             return encodeOneCEAsCE32(0);
142         } else if(cesLength == 1) {
143             return encodeOneCE(ces[0]);
144         } else if(cesLength == 2) {
145             // Try to encode two CEs as one CE32.
146             long ce0 = ces[0];
147             long ce1 = ces[1];
148             long p0 = ce0 >>> 32;
149             if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE &&
150                     (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE &&
151                     p0 != 0) {
152                 // Latin mini expansion
153                 return
154                     (int)p0 |
155                     (((int)ce0 & 0xff00) << 8) |
156                     (((int)ce1 >> 16) & 0xff00) |
157                     Collation.SPECIAL_CE32_LOW_BYTE |
158                     Collation.LATIN_EXPANSION_TAG;
159             }
160         }
161         // Try to encode two or more CEs as CE32s.
162         int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH];  // TODO: instance field?
163         for(int i = 0;; ++i) {
164             if(i == cesLength) {
165                 return encodeExpansion32(newCE32s, 0, cesLength);
166             }
167             int ce32 = encodeOneCEAsCE32(ces[i]);
168             if(ce32 == Collation.NO_CE32) { break; }
169             newCE32s[i] = ce32;
170         }
171         return encodeExpansion(ces, 0, cesLength);
172     }
173 
addCE32(CharSequence prefix, CharSequence s, int ce32)174     void addCE32(CharSequence prefix, CharSequence s, int ce32) {
175         if(s.length() == 0) {
176             throw new IllegalArgumentException("mapping from empty string");
177         }
178         if(!isMutable()) {
179             throw new IllegalStateException("attempt to add mappings after build()");
180         }
181         int c = Character.codePointAt(s, 0);
182         int cLength = Character.charCount(c);
183         int oldCE32 = trie.get(c);
184         boolean hasContext = prefix.length() != 0|| s.length() > cLength;
185         if(oldCE32 == Collation.FALLBACK_CE32) {
186             // First tailoring for c.
187             // If c has contextual base mappings or if we add a contextual mapping,
188             // then copy the base mappings.
189             // Otherwise we just override the base mapping.
190             int baseCE32 = base.getFinalCE32(base.getCE32(c));
191             if(hasContext || Collation.ce32HasContext(baseCE32)) {
192                 oldCE32 = copyFromBaseCE32(c, baseCE32, true);
193                 trie.set(c, oldCE32);
194             }
195         }
196         if(!hasContext) {
197             // No prefix, no contraction.
198             if(!isBuilderContextCE32(oldCE32)) {
199                 trie.set(c, ce32);
200             } else {
201                 ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32);
202                 cond.builtCE32 = Collation.NO_CE32;
203                 cond.ce32 = ce32;
204             }
205         } else {
206             ConditionalCE32 cond;
207             if(!isBuilderContextCE32(oldCE32)) {
208                 // Replace the simple oldCE32 with a builder context CE32
209                 // pointing to a new ConditionalCE32 list head.
210                 int index = addConditionalCE32("\0", oldCE32);
211                 int contextCE32 = makeBuilderContextCE32(index);
212                 trie.set(c, contextCE32);
213                 contextChars.add(c);
214                 cond = getConditionalCE32(index);
215             } else {
216                 cond = getConditionalCE32ForCE32(oldCE32);
217                 cond.builtCE32 = Collation.NO_CE32;
218             }
219             CharSequence suffix = s.subSequence(cLength, s.length());
220             String context = new StringBuilder().append((char)prefix.length()).
221                     append(prefix).append(suffix).toString();
222             unsafeBackwardSet.addAll(suffix);
223             for(;;) {
224                 // invariant: context > cond.context
225                 int next = cond.next;
226                 if(next < 0) {
227                     // Append a new ConditionalCE32 after cond.
228                     int index = addConditionalCE32(context, ce32);
229                     cond.next = index;
230                     break;
231                 }
232                 ConditionalCE32 nextCond = getConditionalCE32(next);
233                 int cmp = context.compareTo(nextCond.context);
234                 if(cmp < 0) {
235                     // Insert a new ConditionalCE32 between cond and nextCond.
236                     int index = addConditionalCE32(context, ce32);
237                     cond.next = index;
238                     getConditionalCE32(index).next = next;
239                     break;
240                 } else if(cmp == 0) {
241                     // Same context as before, overwrite its ce32.
242                     nextCond.ce32 = ce32;
243                     break;
244                 }
245                 cond = nextCond;
246             }
247         }
248         modified = true;
249     }
250 
251     /**
252      * Copies all mappings from the src builder, with modifications.
253      * This builder here must not be built yet, and should be empty.
254      */
copyFrom(CollationDataBuilder src, CEModifier modifier)255     void copyFrom(CollationDataBuilder src, CEModifier modifier) {
256         if(!isMutable()) {
257             throw new IllegalStateException("attempt to copyFrom() after build()");
258         }
259         CopyHelper helper = new CopyHelper(src, this, modifier);
260         Iterator<Trie2.Range> trieIterator = src.trie.iterator();
261         Trie2.Range range;
262         while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
263             enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper);
264         }
265         // Update the contextChars and the unsafeBackwardSet while copying,
266         // in case a character had conditional mappings in the source builder
267         // and they were removed later.
268         modified |= src.modified;
269     }
270 
optimize(UnicodeSet set)271     void optimize(UnicodeSet set) {
272         if(set.isEmpty()) { return; }
273         UnicodeSetIterator iter = new UnicodeSetIterator(set);
274         while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
275             int c = iter.codepoint;
276             int ce32 = trie.get(c);
277             if(ce32 == Collation.FALLBACK_CE32) {
278                 ce32 = base.getFinalCE32(base.getCE32(c));
279                 ce32 = copyFromBaseCE32(c, ce32, true);
280                 trie.set(c, ce32);
281             }
282         }
283         modified = true;
284     }
285 
suppressContractions(UnicodeSet set)286     void suppressContractions(UnicodeSet set) {
287         if(set.isEmpty()) { return; }
288         UnicodeSetIterator iter = new UnicodeSetIterator(set);
289         while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
290             int c = iter.codepoint;
291             int ce32 = trie.get(c);
292             if(ce32 == Collation.FALLBACK_CE32) {
293                 ce32 = base.getFinalCE32(base.getCE32(c));
294                 if(Collation.ce32HasContext(ce32)) {
295                     ce32 = copyFromBaseCE32(c, ce32, false /* without context */);
296                     trie.set(c, ce32);
297                 }
298             } else if(isBuilderContextCE32(ce32)) {
299                 ce32 = getConditionalCE32ForCE32(ce32).ce32;
300                 // Simply abandon the list of ConditionalCE32.
301                 // The caller will copy this builder in the end,
302                 // eliminating unreachable data.
303                 trie.set(c, ce32);
304                 contextChars.remove(c);
305             }
306         }
307         modified = true;
308     }
309 
enableFastLatin()310     void enableFastLatin() { fastLatinEnabled = true; }
build(CollationData data)311     void build(CollationData data) {
312         buildMappings(data);
313         if(base != null) {
314             data.numericPrimary = base.numericPrimary;
315             data.compressibleBytes = base.compressibleBytes;
316             data.numScripts = base.numScripts;
317             data.scriptsIndex = base.scriptsIndex;
318             data.scriptStarts = base.scriptStarts;
319         }
320         buildFastLatinTable(data);
321     }
322 
323     /**
324      * Looks up CEs for s and appends them to the ces array.
325      * Does not handle normalization: s should be in FCD form.
326      *
327      * Does not write completely ignorable CEs.
328      * Does not write beyond Collation.MAX_EXPANSION_LENGTH.
329      *
330      * @return incremented cesLength
331      */
getCEs(CharSequence s, long ces[], int cesLength)332     int getCEs(CharSequence s, long ces[], int cesLength) {
333         return getCEs(s, 0, ces, cesLength);
334     }
335 
getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength)336     int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
337         int prefixLength = prefix.length();
338         if(prefixLength == 0) {
339             return getCEs(s, 0, ces, cesLength);
340         } else {
341             return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength);
342         }
343     }
344 
345     /**
346      * Build-time context and CE32 for a code point.
347      * If a code point has contextual mappings, then the default (no-context) mapping
348      * and all conditional mappings are stored in a singly-linked list
349      * of ConditionalCE32, sorted by context strings.
350      *
351      * Context strings sort by prefix length, then by prefix, then by contraction suffix.
352      * Context strings must be unique and in ascending order.
353      */
354     private static final class ConditionalCE32 {
ConditionalCE32(String ct, int ce)355         ConditionalCE32(String ct, int ce) {
356             context = ct;
357             ce32 = ce;
358             defaultCE32 = Collation.NO_CE32;
359             builtCE32 = Collation.NO_CE32;
360             next = -1;
361         }
362 
hasContext()363         boolean hasContext() { return context.length() > 1; }
prefixLength()364         int prefixLength() { return context.charAt(0); }
365 
366         /**
367          * "\0" for the first entry for any code point, with its default CE32.
368          *
369          * Otherwise one unit with the length of the prefix string,
370          * then the prefix string, then the contraction suffix.
371          */
372         String context;
373         /**
374          * CE32 for the code point and its context.
375          * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
376          */
377         int ce32;
378         /**
379          * Default CE32 for all contexts with this same prefix.
380          * Initially NO_CE32. Set only while building runtime data structures,
381          * and only on one of the nodes of a sub-list with the same prefix.
382          */
383         int defaultCE32;
384         /**
385          * CE32 for the built contexts.
386          * When fetching CEs from the builder, the contexts are built into their runtime form
387          * so that the normal collation implementation can process them.
388          * The result is cached in the list head. It is reset when the contexts are modified.
389          */
390         int builtCE32;
391         /**
392          * Index of the next ConditionalCE32.
393          * Negative for the end of the list.
394          */
395         int next;
396     }
397 
getCE32FromOffsetCE32(boolean fromBase, int c, int ce32)398     protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) {
399         int i = Collation.indexFromCE32(ce32);
400         long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i);
401         long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
402         return Collation.makeLongPrimaryCE32(p);
403     }
404 
addCE(long ce)405     protected int addCE(long ce) {
406         int length = ce64s.size();
407         for(int i = 0; i < length; ++i) {
408             if(ce == ce64s.elementAti(i)) { return i; }
409         }
410         ce64s.addElement(ce);
411         return length;
412     }
413 
addCE32(int ce32)414     protected int addCE32(int ce32) {
415         int length = ce32s.size();
416         for(int i = 0; i < length; ++i) {
417             if(ce32 == ce32s.elementAti(i)) { return i; }
418         }
419         ce32s.addElement(ce32);
420         return length;
421     }
422 
addConditionalCE32(String context, int ce32)423     protected int addConditionalCE32(String context, int ce32) {
424         assert(context.length() != 0);
425         int index = conditionalCE32s.size();
426         if(index > Collation.MAX_INDEX) {
427             throw new IndexOutOfBoundsException("too many context-sensitive mappings");
428             // BufferOverflowException is a better fit
429             // but cannot be constructed with a message string.
430         }
431         ConditionalCE32 cond = new ConditionalCE32(context, ce32);
432         conditionalCE32s.add(cond);
433         return index;
434     }
435 
getConditionalCE32(int index)436     protected ConditionalCE32 getConditionalCE32(int index) {
437         return conditionalCE32s.get(index);
438     }
getConditionalCE32ForCE32(int ce32)439     protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) {
440         return getConditionalCE32(Collation.indexFromCE32(ce32));
441     }
442 
makeBuilderContextCE32(int index)443     protected static int makeBuilderContextCE32(int index) {
444         return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index);
445     }
isBuilderContextCE32(int ce32)446     protected static boolean isBuilderContextCE32(int ce32) {
447         return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG);
448     }
449 
encodeOneCEAsCE32(long ce)450     protected static int encodeOneCEAsCE32(long ce) {
451         long p = ce >>> 32;
452         int lower32 = (int)ce;
453         int t = lower32 & 0xffff;
454         assert((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
455         if((ce & 0xffff00ff00ffL) == 0) {
456             // normal form ppppsstt
457             return (int)p | (lower32 >>> 16) | (t >> 8);
458         } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) {
459             // long-primary form ppppppC1
460             return Collation.makeLongPrimaryCE32(p);
461         } else if(p == 0 && (t & 0xff) == 0) {
462             // long-secondary form ssssttC2
463             return Collation.makeLongSecondaryCE32(lower32);
464         }
465         return Collation.NO_CE32;
466     }
467 
encodeOneCE(long ce)468     protected int encodeOneCE(long ce) {
469         // Try to encode one CE as one CE32.
470         int ce32 = encodeOneCEAsCE32(ce);
471         if(ce32 != Collation.NO_CE32) { return ce32; }
472         int index = addCE(ce);
473         if(index > Collation.MAX_INDEX) {
474             throw new IndexOutOfBoundsException("too many mappings");
475             // BufferOverflowException is a better fit
476             // but cannot be constructed with a message string.
477         }
478         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1);
479     }
480 
encodeExpansion(long ces[], int start, int length)481     protected int encodeExpansion(long ces[], int start, int length) {
482         // See if this sequence of CEs has already been stored.
483         long first = ces[start];
484         int ce64sMax = ce64s.size() - length;
485         for(int i = 0; i <= ce64sMax; ++i) {
486             if(first == ce64s.elementAti(i)) {
487                 if(i > Collation.MAX_INDEX) {
488                     throw new IndexOutOfBoundsException("too many mappings");
489                     // BufferOverflowException is a better fit
490                     // but cannot be constructed with a message string.
491                 }
492                 for(int j = 1;; ++j) {
493                     if(j == length) {
494                         return Collation.makeCE32FromTagIndexAndLength(
495                                 Collation.EXPANSION_TAG, i, length);
496                     }
497                     if(ce64s.elementAti(i + j) != ces[start + j]) { break; }
498                 }
499             }
500         }
501         // Store the new sequence.
502         int i = ce64s.size();
503         if(i > Collation.MAX_INDEX) {
504             throw new IndexOutOfBoundsException("too many mappings");
505             // BufferOverflowException is a better fit
506             // but cannot be constructed with a message string.
507         }
508         for(int j = 0; j < length; ++j) {
509             ce64s.addElement(ces[start + j]);
510         }
511         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length);
512     }
513 
encodeExpansion32(int newCE32s[], int start, int length)514     protected int encodeExpansion32(int newCE32s[], int start, int length) {
515         // See if this sequence of CE32s has already been stored.
516         int first = newCE32s[start];
517         int ce32sMax = ce32s.size() - length;
518         for(int i = 0; i <= ce32sMax; ++i) {
519             if(first == ce32s.elementAti(i)) {
520                 if(i > Collation.MAX_INDEX) {
521                     throw new IndexOutOfBoundsException("too many mappings");
522                     // BufferOverflowException is a better fit
523                     // but cannot be constructed with a message string.
524                 }
525                 for(int j = 1;; ++j) {
526                     if(j == length) {
527                         return Collation.makeCE32FromTagIndexAndLength(
528                                 Collation.EXPANSION32_TAG, i, length);
529                     }
530                     if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; }
531                 }
532             }
533         }
534         // Store the new sequence.
535         int i = ce32s.size();
536         if(i > Collation.MAX_INDEX) {
537             throw new IndexOutOfBoundsException("too many mappings");
538             // BufferOverflowException is a better fit
539             // but cannot be constructed with a message string.
540         }
541         for(int j = 0; j < length; ++j) {
542             ce32s.addElement(newCE32s[start + j]);
543         }
544         return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length);
545     }
546 
copyFromBaseCE32(int c, int ce32, boolean withContext)547     protected int copyFromBaseCE32(int c, int ce32, boolean withContext) {
548         if(!Collation.isSpecialCE32(ce32)) { return ce32; }
549         switch(Collation.tagFromCE32(ce32)) {
550         case Collation.LONG_PRIMARY_TAG:
551         case Collation.LONG_SECONDARY_TAG:
552         case Collation.LATIN_EXPANSION_TAG:
553             // copy as is
554             break;
555         case Collation.EXPANSION32_TAG: {
556             int index = Collation.indexFromCE32(ce32);
557             int length = Collation.lengthFromCE32(ce32);
558             ce32 = encodeExpansion32(base.ce32s, index, length);
559             break;
560         }
561         case Collation.EXPANSION_TAG: {
562             int index = Collation.indexFromCE32(ce32);
563             int length = Collation.lengthFromCE32(ce32);
564             ce32 = encodeExpansion(base.ces, index, length);
565             break;
566         }
567         case Collation.PREFIX_TAG: {
568             // Flatten prefixes and nested suffixes (contractions)
569             // into a linear list of ConditionalCE32.
570             int trieIndex = Collation.indexFromCE32(ce32);
571             ce32 = base.getCE32FromContexts(trieIndex);  // Default if no prefix match.
572             if(!withContext) {
573                 return copyFromBaseCE32(c, ce32, false);
574             }
575             ConditionalCE32 head = new ConditionalCE32("", 0);
576             StringBuilder context = new StringBuilder("\0");
577             int index;
578             if(Collation.isContractionCE32(ce32)) {
579                 index = copyContractionsFromBaseCE32(context, c, ce32, head);
580             } else {
581                 ce32 = copyFromBaseCE32(c, ce32, true);
582                 head.next = index = addConditionalCE32(context.toString(), ce32);
583             }
584             ConditionalCE32 cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
585             CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
586             while(prefixes.hasNext()) {
587                 CharsTrie.Entry entry = prefixes.next();
588                 context.setLength(0);
589                 context.append(entry.chars).reverse().insert(0, (char)entry.chars.length());
590                 ce32 = entry.value;
591                 if(Collation.isContractionCE32(ce32)) {
592                     index = copyContractionsFromBaseCE32(context, c, ce32, cond);
593                 } else {
594                     ce32 = copyFromBaseCE32(c, ce32, true);
595                     cond.next = index = addConditionalCE32(context.toString(), ce32);
596                 }
597                 cond = getConditionalCE32(index);
598             }
599             ce32 = makeBuilderContextCE32(head.next);
600             contextChars.add(c);
601             break;
602         }
603         case Collation.CONTRACTION_TAG: {
604             if(!withContext) {
605                 int index = Collation.indexFromCE32(ce32);
606                 ce32 = base.getCE32FromContexts(index);  // Default if no suffix match.
607                 return copyFromBaseCE32(c, ce32, false);
608             }
609             ConditionalCE32 head = new ConditionalCE32("", 0);
610             StringBuilder context = new StringBuilder("\0");
611             copyContractionsFromBaseCE32(context, c, ce32, head);
612             ce32 = makeBuilderContextCE32(head.next);
613             contextChars.add(c);
614             break;
615         }
616         case Collation.HANGUL_TAG:
617             throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables.");
618         case Collation.OFFSET_TAG:
619             ce32 = getCE32FromOffsetCE32(true, c, ce32);
620             break;
621         case Collation.IMPLICIT_TAG:
622             ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c));
623             break;
624         default:
625             throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " +
626                     "requires ce32 == base.getFinalCE32(ce32)");
627         }
628         return ce32;
629     }
630 
631     /**
632      * Copies base contractions to a list of ConditionalCE32.
633      * Sets cond.next to the index of the first new item
634      * and returns the index of the last new item.
635      */
copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32, ConditionalCE32 cond)636     protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32,
637             ConditionalCE32 cond) {
638         int trieIndex = Collation.indexFromCE32(ce32);
639         int index;
640         if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
641             // No match on the single code point.
642             // We are underneath a prefix, and the default mapping is just
643             // a fallback to the mappings for a shorter prefix.
644             assert(context.length() > 1);
645             index = -1;
646         } else {
647             ce32 = base.getCE32FromContexts(trieIndex);  // Default if no suffix match.
648             assert(!Collation.isContractionCE32(ce32));
649             ce32 = copyFromBaseCE32(c, ce32, true);
650             cond.next = index = addConditionalCE32(context.toString(), ce32);
651             cond = getConditionalCE32(index);
652         }
653 
654         int suffixStart = context.length();
655         CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
656         while(suffixes.hasNext()) {
657             CharsTrie.Entry entry = suffixes.next();
658             context.append(entry.chars);
659             ce32 = copyFromBaseCE32(c, entry.value, true);
660             cond.next = index = addConditionalCE32(context.toString(), ce32);
661             // No need to update the unsafeBackwardSet because the tailoring set
662             // is already a copy of the base set.
663             cond = getConditionalCE32(index);
664             context.setLength(suffixStart);
665         }
666         assert(index >= 0);
667         return index;
668     }
669 
670     private static final class CopyHelper {
CopyHelper(CollationDataBuilder s, CollationDataBuilder d, CollationDataBuilder.CEModifier m)671         CopyHelper(CollationDataBuilder s, CollationDataBuilder d,
672                   CollationDataBuilder.CEModifier m) {
673             src = s;
674             dest = d;
675             modifier = m;
676         }
677 
copyRangeCE32(int start, int end, int ce32)678         void copyRangeCE32(int start, int end, int ce32) {
679             ce32 = copyCE32(ce32);
680             dest.trie.setRange(start, end, ce32, true);
681             if(CollationDataBuilder.isBuilderContextCE32(ce32)) {
682                 dest.contextChars.add(start, end);
683             }
684         }
685 
copyCE32(int ce32)686         int copyCE32(int ce32) {
687             if(!Collation.isSpecialCE32(ce32)) {
688                 long ce = modifier.modifyCE32(ce32);
689                 if(ce != Collation.NO_CE) {
690                     ce32 = dest.encodeOneCE(ce);
691                 }
692             } else {
693                 int tag = Collation.tagFromCE32(ce32);
694                 if(tag == Collation.EXPANSION32_TAG) {
695                     int[] srcCE32s = src.ce32s.getBuffer();
696                     int srcIndex = Collation.indexFromCE32(ce32);
697                     int length = Collation.lengthFromCE32(ce32);
698                     // Inspect the source CE32s. Just copy them if none are modified.
699                     // Otherwise copy to modifiedCEs, with modifications.
700                     boolean isModified = false;
701                     for(int i = 0; i < length; ++i) {
702                         ce32 = srcCE32s[srcIndex + i];
703                         long ce;
704                         if(Collation.isSpecialCE32(ce32) ||
705                                 (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) {
706                             if(isModified) {
707                                 modifiedCEs[i] = Collation.ceFromCE32(ce32);
708                             }
709                         } else {
710                             if(!isModified) {
711                                 for(int j = 0; j < i; ++j) {
712                                     modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]);
713                                 }
714                                 isModified = true;
715                             }
716                             modifiedCEs[i] = ce;
717                         }
718                     }
719                     if(isModified) {
720                         ce32 = dest.encodeCEs(modifiedCEs, length);
721                     } else {
722                         ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length);
723                     }
724                 } else if(tag == Collation.EXPANSION_TAG) {
725                     long[] srcCEs = src.ce64s.getBuffer();
726                     int srcIndex = Collation.indexFromCE32(ce32);
727                     int length = Collation.lengthFromCE32(ce32);
728                     // Inspect the source CEs. Just copy them if none are modified.
729                     // Otherwise copy to modifiedCEs, with modifications.
730                     boolean isModified = false;
731                     for(int i = 0; i < length; ++i) {
732                         long srcCE = srcCEs[srcIndex + i];
733                         long ce = modifier.modifyCE(srcCE);
734                         if(ce == Collation.NO_CE) {
735                             if(isModified) {
736                                 modifiedCEs[i] = srcCE;
737                             }
738                         } else {
739                             if(!isModified) {
740                                 for(int j = 0; j < i; ++j) {
741                                     modifiedCEs[j] = srcCEs[srcIndex + j];
742                                 }
743                                 isModified = true;
744                             }
745                             modifiedCEs[i] = ce;
746                         }
747                     }
748                     if(isModified) {
749                         ce32 = dest.encodeCEs(modifiedCEs, length);
750                     } else {
751                         ce32 = dest.encodeExpansion(srcCEs, srcIndex, length);
752                     }
753                 } else if(tag == Collation.BUILDER_DATA_TAG) {
754                     // Copy the list of ConditionalCE32.
755                     ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32);
756                     assert(!cond.hasContext());
757                     int destIndex = dest.addConditionalCE32(
758                             cond.context, copyCE32(cond.ce32));
759                     ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex);
760                     while(cond.next >= 0) {
761                         cond = src.getConditionalCE32(cond.next);
762                         ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex);
763                         destIndex = dest.addConditionalCE32(
764                                 cond.context, copyCE32(cond.ce32));
765                         int suffixStart = cond.prefixLength() + 1;
766                         dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart));
767                         prevDestCond.next = destIndex;
768                     }
769                 } else {
770                     // Just copy long CEs and Latin mini expansions (and other expected values) as is,
771                     // assuming that the modifier would not modify them.
772                     assert(tag == Collation.LONG_PRIMARY_TAG ||
773                             tag == Collation.LONG_SECONDARY_TAG ||
774                             tag == Collation.LATIN_EXPANSION_TAG ||
775                             tag == Collation.HANGUL_TAG);
776                 }
777             }
778             return ce32;
779         }
780 
781         CollationDataBuilder src;
782         CollationDataBuilder dest;
783         CollationDataBuilder.CEModifier modifier;
784         long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH];
785     }
786 
787     private static void
enumRangeForCopy(int start, int end, int value, CopyHelper helper)788     enumRangeForCopy(int start, int end, int value, CopyHelper helper) {
789         if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) {
790             helper.copyRangeCE32(start, end, value);
791         }
792     }
793 
getJamoCE32s(int jamoCE32s[])794     protected boolean getJamoCE32s(int jamoCE32s[]) {
795         boolean anyJamoAssigned = base == null;  // always set jamoCE32s in the base data
796         boolean needToCopyFromBase = false;
797         for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
798             int jamo = jamoCpFromIndex(j);
799             boolean fromBase = false;
800             int ce32 = trie.get(jamo);
801             anyJamoAssigned |= Collation.isAssignedCE32(ce32);
802             // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
803             // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
804             if(ce32 == Collation.FALLBACK_CE32) {
805                 fromBase = true;
806                 ce32 = base.getCE32(jamo);
807             }
808             if(Collation.isSpecialCE32(ce32)) {
809                 switch(Collation.tagFromCE32(ce32)) {
810                 case Collation.LONG_PRIMARY_TAG:
811                 case Collation.LONG_SECONDARY_TAG:
812                 case Collation.LATIN_EXPANSION_TAG:
813                     // Copy the ce32 as-is.
814                     break;
815                 case Collation.EXPANSION32_TAG:
816                 case Collation.EXPANSION_TAG:
817                 case Collation.PREFIX_TAG:
818                 case Collation.CONTRACTION_TAG:
819                     if(fromBase) {
820                         // Defer copying until we know if anyJamoAssigned.
821                         ce32 = Collation.FALLBACK_CE32;
822                         needToCopyFromBase = true;
823                     }
824                     break;
825                 case Collation.IMPLICIT_TAG:
826                     // An unassigned Jamo should only occur in tests with incomplete bases.
827                     assert(fromBase);
828                     ce32 = Collation.FALLBACK_CE32;
829                     needToCopyFromBase = true;
830                     break;
831                 case Collation.OFFSET_TAG:
832                     ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
833                     break;
834                 case Collation.FALLBACK_TAG:
835                 case Collation.RESERVED_TAG_3:
836                 case Collation.BUILDER_DATA_TAG:
837                 case Collation.DIGIT_TAG:
838                 case Collation.U0000_TAG:
839                 case Collation.HANGUL_TAG:
840                 case Collation.LEAD_SURROGATE_TAG:
841                     throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32));
842                 }
843             }
844             jamoCE32s[j] = ce32;
845         }
846         if(anyJamoAssigned && needToCopyFromBase) {
847             for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {
848                 if(jamoCE32s[j] == Collation.FALLBACK_CE32) {
849                     int jamo = jamoCpFromIndex(j);
850                     jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo),
851                                                     /*withContext=*/ true);
852                 }
853             }
854         }
855         return anyJamoAssigned;
856     }
857 
setDigitTags()858     protected void setDigitTags() {
859         UnicodeSet digits = new UnicodeSet("[:Nd:]");
860         UnicodeSetIterator iter = new UnicodeSetIterator(digits);
861         while(iter.next()) {
862             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
863             int c = iter.codepoint;
864             int ce32 = trie.get(c);
865             if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) {
866                 int index = addCE32(ce32);
867                 if(index > Collation.MAX_INDEX) {
868                     throw new IndexOutOfBoundsException("too many mappings");
869                     // BufferOverflowException is a better fit
870                     // but cannot be constructed with a message string.
871                 }
872                 ce32 = Collation.makeCE32FromTagIndexAndLength(
873                         Collation.DIGIT_TAG, index, UCharacter.digit(c));  // u_charDigitValue(c)
874                 trie.set(c, ce32);
875             }
876         }
877     }
878 
setLeadSurrogates()879     protected void setLeadSurrogates() {
880         for(char lead = 0xd800; lead < 0xdc00; ++lead) {
881             int leadValue = -1;
882             // utrie2_enumForLeadSurrogate(trie, lead, null, , &value);
883             Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead);
884             while(trieIterator.hasNext()) {
885                 Trie2.Range range = trieIterator.next();
886                 // The rest of this loop is equivalent to C++ enumRangeLeadValue().
887                 int value = range.value;
888                 if(value == Collation.UNASSIGNED_CE32) {
889                     value = Collation.LEAD_ALL_UNASSIGNED;
890                 } else if(value == Collation.FALLBACK_CE32) {
891                     value = Collation.LEAD_ALL_FALLBACK;
892                 } else {
893                     leadValue = Collation.LEAD_MIXED;
894                     break;
895                 }
896                 if(leadValue < 0) {
897                     leadValue = value;
898                 } else if(leadValue != value) {
899                     leadValue = Collation.LEAD_MIXED;
900                     break;
901                 }
902             }
903             trie.setForLeadSurrogateCodeUnit(lead,
904                     Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue);
905         }
906     }
907 
buildMappings(CollationData data)908     protected void buildMappings(CollationData data) {
909         if(!isMutable()) {
910             throw new IllegalStateException("attempt to build() after build()");
911         }
912 
913         buildContexts();
914 
915         int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
916         int jamoIndex = -1;
917         if(getJamoCE32s(jamoCE32s)) {
918             jamoIndex = ce32s.size();
919             for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
920                 ce32s.addElement(jamoCE32s[i]);
921             }
922             // Small optimization: Use a bit in the Hangul ce32
923             // to indicate that none of the Jamo CE32s are isSpecialCE32()
924             // (as it should be in the root collator).
925             // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
926             // In order to still have good trie compression and keep this code simple,
927             // we only set this flag if a whole block of 588 Hangul syllables starting with
928             // a common leading consonant (Jamo L) has this property.
929             boolean isAnyJamoVTSpecial = false;
930             for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
931                 if(Collation.isSpecialCE32(jamoCE32s[i])) {
932                     isAnyJamoVTSpecial = true;
933                     break;
934                 }
935             }
936             int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
937             int c = Hangul.HANGUL_BASE;
938             for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
939                 int ce32 = hangulCE32;
940                 if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) {
941                     ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO;
942                 }
943                 int limit = c + Hangul.JAMO_VT_COUNT;
944                 trie.setRange(c, limit - 1, ce32, true);
945                 c = limit;
946             }
947         } else {
948             // Copy the Hangul CE32s from the base in blocks per Jamo L,
949             // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
950             for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) {
951                 int ce32 = base.getCE32(c);
952                 assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG));
953                 int limit = c + Hangul.JAMO_VT_COUNT;
954                 trie.setRange(c, limit - 1, ce32, true);
955                 c = limit;
956             }
957         }
958 
959         setDigitTags();
960         setLeadSurrogates();
961 
962         // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
963         ce32s.setElementAt(trie.get(0), 0);
964         trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0));
965 
966         data.trie = trie.toTrie2_32();
967 
968         // Mark each lead surrogate as "unsafe"
969         // if any of its 1024 associated supplementary code points is "unsafe".
970         int c = 0x10000;
971         for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
972             if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
973                 unsafeBackwardSet.add(lead);
974             }
975         }
976         unsafeBackwardSet.freeze();
977 
978         data.ce32s = ce32s.getBuffer();
979         data.ces = ce64s.getBuffer();
980         data.contexts = contexts.toString();
981 
982         data.base = base;
983         if(jamoIndex >= 0) {
984             data.jamoCE32s = jamoCE32s;  // C++: data.ce32s + jamoIndex
985         } else {
986             data.jamoCE32s = base.jamoCE32s;
987         }
988         data.unsafeBackwardSet = unsafeBackwardSet;
989     }
990 
clearContexts()991     protected void clearContexts() {
992         contexts.setLength(0);
993         UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
994         while(iter.next()) {
995             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
996             int ce32 = trie.get(iter.codepoint);
997             assert(isBuilderContextCE32(ce32));
998             getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32;
999         }
1000     }
1001 
buildContexts()1002     protected void buildContexts() {
1003         // Ignore abandoned lists and the cached builtCE32,
1004         // and build all contexts from scratch.
1005         contexts.setLength(0);
1006         UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
1007         while(iter.next()) {
1008             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1009             int c = iter.codepoint;
1010             int ce32 = trie.get(c);
1011             if(!isBuilderContextCE32(ce32)) {
1012                 throw new AssertionError("Impossible: No context data for c in contextChars.");
1013             }
1014             ConditionalCE32 cond = getConditionalCE32ForCE32(ce32);
1015             ce32 = buildContext(cond);
1016             trie.set(c, ce32);
1017         }
1018     }
1019 
buildContext(ConditionalCE32 head)1020     protected int buildContext(ConditionalCE32 head) {
1021         // The list head must have no context.
1022         assert(!head.hasContext());
1023         // The list head must be followed by one or more nodes that all do have context.
1024         assert(head.next >= 0);
1025         CharsTrieBuilder prefixBuilder = new CharsTrieBuilder();
1026         CharsTrieBuilder contractionBuilder = new CharsTrieBuilder();
1027         for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) {
1028             // After the list head, the prefix or suffix can be empty, but not both.
1029             assert(cond == head || cond.hasContext());
1030             int prefixLength = cond.prefixLength();
1031             StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1);
1032             String prefixString = prefix.toString();
1033             // Collect all contraction suffixes for one prefix.
1034             ConditionalCE32 firstCond = cond;
1035             ConditionalCE32 lastCond = cond;
1036             while(cond.next >= 0 &&
1037                     (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) {
1038                 lastCond = cond;
1039             }
1040             int ce32;
1041             int suffixStart = prefixLength + 1;  // == prefix.length()
1042             if(lastCond.context.length() == suffixStart) {
1043                 // One prefix without contraction suffix.
1044                 assert(firstCond == lastCond);
1045                 ce32 = lastCond.ce32;
1046                 cond = lastCond;
1047             } else {
1048                 // Build the contractions trie.
1049                 contractionBuilder.clear();
1050                 // Entry for an empty suffix, to be stored before the trie.
1051                 int emptySuffixCE32 = Collation.NO_CE32;  // Will always be set to a real value.
1052                 int flags = 0;
1053                 if(firstCond.context.length() == suffixStart) {
1054                     // There is a mapping for the prefix and the single character c. (p|c)
1055                     // If no other suffix matches, then we return this value.
1056                     emptySuffixCE32 = firstCond.ce32;
1057                     cond = getConditionalCE32(firstCond.next);
1058                 } else {
1059                     // There is no mapping for the prefix and just the single character.
1060                     // (There is no p|c, only p|cd, p|ce etc.)
1061                     flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH;
1062                     // When the prefix matches but none of the prefix-specific suffixes,
1063                     // then we fall back to the mappings with the next-longest prefix,
1064                     // and ultimately to mappings with no prefix.
1065                     // Each fallback might be another set of contractions.
1066                     // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1067                     // then in text "pch" we find the ch contraction.
1068                     for(cond = head;; cond = getConditionalCE32(cond.next)) {
1069                         int length = cond.prefixLength();
1070                         if(length == prefixLength) { break; }
1071                         if(cond.defaultCE32 != Collation.NO_CE32 &&
1072                                 (length==0 || prefixString.regionMatches(
1073                                         prefix.length() - length, cond.context, 1, length)
1074                                         /* C++: prefix.endsWith(cond.context, 1, length) */)) {
1075                             emptySuffixCE32 = cond.defaultCE32;
1076                         }
1077                     }
1078                     cond = firstCond;
1079                 }
1080                 // Optimization: Set a flag when
1081                 // the first character of every contraction suffix has lccc!=0.
1082                 // Short-circuits contraction matching when a normal letter follows.
1083                 flags |= Collation.CONTRACT_NEXT_CCC;
1084                 // Add all of the non-empty suffixes into the contraction trie.
1085                 for(;;) {
1086                     String suffix = cond.context.substring(suffixStart);
1087                     int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0));
1088                     if(fcd16 <= 0xff) {
1089                         flags &= ~Collation.CONTRACT_NEXT_CCC;
1090                     }
1091                     fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length()));
1092                     if(fcd16 > 0xff) {
1093                         // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1094                         flags |= Collation.CONTRACT_TRAILING_CCC;
1095                     }
1096                     contractionBuilder.add(suffix, cond.ce32);
1097                     if(cond == lastCond) { break; }
1098                     cond = getConditionalCE32(cond.next);
1099                 }
1100                 int index = addContextTrie(emptySuffixCE32, contractionBuilder);
1101                 if(index > Collation.MAX_INDEX) {
1102                     throw new IndexOutOfBoundsException("too many context-sensitive mappings");
1103                     // BufferOverflowException is a better fit
1104                     // but cannot be constructed with a message string.
1105                 }
1106                 ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags;
1107             }
1108             assert(cond == lastCond);
1109             firstCond.defaultCE32 = ce32;
1110             if(prefixLength == 0) {
1111                 if(cond.next < 0) {
1112                     // No non-empty prefixes, only contractions.
1113                     return ce32;
1114                 }
1115             } else {
1116                 prefix.delete(0, 1);  // Remove the length unit.
1117                 prefix.reverse();
1118                 prefixBuilder.add(prefix, ce32);
1119                 if(cond.next < 0) { break; }
1120             }
1121         }
1122         assert(head.defaultCE32 != Collation.NO_CE32);
1123         int index = addContextTrie(head.defaultCE32, prefixBuilder);
1124         if(index > Collation.MAX_INDEX) {
1125             throw new IndexOutOfBoundsException("too many context-sensitive mappings");
1126             // BufferOverflowException is a better fit
1127             // but cannot be constructed with a message string.
1128         }
1129         return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index);
1130     }
1131 
addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder)1132     protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) {
1133         StringBuilder context = new StringBuilder();
1134         context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32);
1135         context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL));
1136         int index = contexts.indexOf(context.toString());
1137         if(index < 0) {
1138             index = contexts.length();
1139             contexts.append(context);
1140         }
1141         return index;
1142     }
1143 
buildFastLatinTable(CollationData data)1144     protected void buildFastLatinTable(CollationData data) {
1145         if(!fastLatinEnabled) { return; }
1146 
1147         fastLatinBuilder = new CollationFastLatinBuilder();
1148         if(fastLatinBuilder.forData(data)) {
1149             char[] header = fastLatinBuilder.getHeader();
1150             char[] table = fastLatinBuilder.getTable();
1151             if(base != null &&
1152                     Arrays.equals(header, base.fastLatinTableHeader) &&
1153                     Arrays.equals(table, base.fastLatinTable)) {
1154                 // Same fast Latin table as in the base, use that one instead.
1155                 fastLatinBuilder = null;
1156                 header = base.fastLatinTableHeader;
1157                 table = base.fastLatinTable;
1158             }
1159             data.fastLatinTableHeader = header;
1160             data.fastLatinTable = table;
1161         } else {
1162             fastLatinBuilder = null;
1163         }
1164     }
1165 
getCEs(CharSequence s, int start, long ces[], int cesLength)1166     protected int getCEs(CharSequence s, int start, long ces[], int cesLength) {
1167         if(collIter == null) {
1168             collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl));
1169             if(collIter == null) { return 0; }
1170         }
1171         return collIter.fetchCEs(s, start, ces, cesLength);
1172     }
1173 
jamoCpFromIndex(int i)1174     protected static int jamoCpFromIndex(int i) {
1175         // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27
1176         if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; }
1177         i -= Hangul.JAMO_L_COUNT;
1178         if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; }
1179         i -= Hangul.JAMO_V_COUNT;
1180         // i < 27
1181         return Hangul.JAMO_T_BASE + 1 + i;
1182     }
1183 
1184     /**
1185      * Build-time collation element and character iterator.
1186      * Uses the runtime CollationIterator for fetching CEs for a string
1187      * but reads from the builder's unfinished data structures.
1188      * In particular, this class reads from the unfinished trie
1189      * and has to avoid CollationIterator.nextCE() and redirect other
1190      * calls to data.getCE32() and data.getCE32FromSupplementary().
1191      *
1192      * We do this so that we need not implement the collation algorithm
1193      * again for the builder and make it behave exactly like the runtime code.
1194      * That would be more difficult to test and maintain than this indirection.
1195      *
1196      * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
1197      * so the data accesses from those code paths need not be modified.
1198      *
1199      * This class iterates directly over whole code points
1200      * so that the CollationIterator does not need the finished trie
1201      * for handling the LEAD_SURROGATE_TAG.
1202      */
1203     private static final class DataBuilderCollationIterator extends CollationIterator {
DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData)1204         DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) {
1205             super(newData, /*numeric=*/ false);
1206             builder = b;
1207             builderData = newData;
1208             builderData.base = builder.base;
1209             // Set all of the jamoCE32s[] to indirection CE32s.
1210             for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
1211                 int jamo = CollationDataBuilder.jamoCpFromIndex(j);
1212                 jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) |
1213                         CollationDataBuilder.IS_BUILDER_JAMO_CE32;
1214             }
1215             builderData.jamoCE32s = jamoCE32s;
1216         }
1217 
fetchCEs(CharSequence str, int start, long ces[], int cesLength)1218         int fetchCEs(CharSequence str, int start, long ces[], int cesLength) {
1219             // Set the pointers each time, in case they changed due to reallocation.
1220             builderData.ce32s = builder.ce32s.getBuffer();
1221             builderData.ces = builder.ce64s.getBuffer();
1222             builderData.contexts = builder.contexts.toString();
1223             // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32().
1224             reset();
1225             s = str;
1226             pos = start;
1227             while(pos < s.length()) {
1228                 // No need to keep all CEs in the iterator buffer.
1229                 clearCEs();
1230                 int c = Character.codePointAt(s, pos);
1231                 pos += Character.charCount(c);
1232                 int ce32 = builder.trie.get(c);
1233                 CollationData d;
1234                 if(ce32 == Collation.FALLBACK_CE32) {
1235                     d = builder.base;
1236                     ce32 = builder.base.getCE32(c);
1237                 } else {
1238                     d = builderData;
1239                 }
1240                 appendCEsFromCE32(d, c, ce32, /*forward=*/ true);
1241                 for(int i = 0; i < getCEsLength(); ++i) {
1242                     long ce = getCE(i);
1243                     if(ce != 0) {
1244                         if(cesLength < Collation.MAX_EXPANSION_LENGTH) {
1245                             ces[cesLength] = ce;
1246                         }
1247                         ++cesLength;
1248                     }
1249                 }
1250             }
1251             return cesLength;
1252         }
1253 
1254         @Override
resetToOffset(int newOffset)1255         public void resetToOffset(int newOffset) {
1256             reset();
1257             pos = newOffset;
1258         }
1259 
1260         @Override
getOffset()1261         public int getOffset() {
1262             return pos;
1263         }
1264 
1265         @Override
nextCodePoint()1266         public int nextCodePoint() {
1267             if(pos == s.length()) {
1268                 return Collation.SENTINEL_CP;
1269             }
1270             int c = Character.codePointAt(s, pos);
1271             pos += Character.charCount(c);
1272             return c;
1273         }
1274 
1275         @Override
previousCodePoint()1276         public int previousCodePoint() {
1277             if(pos == 0) {
1278                 return Collation.SENTINEL_CP;
1279             }
1280             int c = Character.codePointBefore(s, pos);
1281             pos -= Character.charCount(c);
1282             return c;
1283         }
1284 
1285         @Override
forwardNumCodePoints(int num)1286         protected void forwardNumCodePoints(int num) {
1287             pos = Character.offsetByCodePoints(s, pos, num);
1288         }
1289 
1290         @Override
backwardNumCodePoints(int num)1291         protected void backwardNumCodePoints(int num) {
1292             pos = Character.offsetByCodePoints(s, pos, -num);
1293         }
1294 
1295         @Override
getDataCE32(int c)1296         protected int getDataCE32(int c) {
1297             return builder.trie.get(c);
1298         }
1299 
1300         @Override
getCE32FromBuilderData(int ce32)1301         protected int getCE32FromBuilderData(int ce32) {
1302             assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG));
1303             if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) {
1304                 int jamo = Collation.indexFromCE32(ce32);
1305                 return builder.trie.get(jamo);
1306             } else {
1307                 ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32);
1308                 if(cond.builtCE32 == Collation.NO_CE32) {
1309                     // Build the context-sensitive mappings into their runtime form and cache the result.
1310                     try {
1311                         cond.builtCE32 = builder.buildContext(cond);
1312                     } catch(IndexOutOfBoundsException e) {
1313                         builder.clearContexts();
1314                         cond.builtCE32 = builder.buildContext(cond);
1315                     }
1316                     builderData.contexts = builder.contexts.toString();
1317                 }
1318                 return cond.builtCE32;
1319             }
1320         }
1321 
1322         protected final CollationDataBuilder builder;
1323         protected final CollationData builderData;
1324         protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
1325         protected CharSequence s;
1326         protected int pos;
1327     }
1328 
isMutable()1329     protected final boolean isMutable() {
1330         // C++ tests !(trie == NULL || utrie2_isFrozen(trie))
1331         // but Java Trie2Writable does not have an observable isFrozen() state.
1332         return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen();
1333     }
1334 
1335     /** @see Collation.BUILDER_DATA_TAG */
1336     private static final int IS_BUILDER_JAMO_CE32 = 0x100;
1337 
1338     protected Normalizer2Impl nfcImpl;
1339     protected CollationData base;
1340     protected CollationSettings baseSettings;
1341     protected Trie2Writable trie;
1342     protected UVector32 ce32s;
1343     protected UVector64 ce64s;
1344     protected ArrayList<ConditionalCE32> conditionalCE32s;  // vector of ConditionalCE32
1345     // Characters that have context (prefixes or contraction suffixes).
1346     protected UnicodeSet contextChars = new UnicodeSet();
1347     // Serialized UCharsTrie structures for finalized contexts.
1348     protected StringBuilder contexts = new StringBuilder();
1349     protected UnicodeSet unsafeBackwardSet = new UnicodeSet();
1350     protected boolean modified;
1351 
1352     protected boolean fastLatinEnabled;
1353     protected CollationFastLatinBuilder fastLatinBuilder;
1354 
1355     protected DataBuilderCollationIterator collIter;
1356 }
1357