• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * CollationBuilder.java, ported from collationbuilder.h/.cpp
9 *
10 * C++ version created on: 2013may06
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import java.text.ParseException;
17 
18 import com.ibm.icu.impl.Norm2AllModes;
19 import com.ibm.icu.impl.Normalizer2Impl;
20 import com.ibm.icu.impl.Normalizer2Impl.Hangul;
21 import com.ibm.icu.lang.UScript;
22 import com.ibm.icu.text.CanonicalIterator;
23 import com.ibm.icu.text.Collator;
24 import com.ibm.icu.text.Normalizer2;
25 import com.ibm.icu.text.UnicodeSet;
26 import com.ibm.icu.text.UnicodeSetIterator;
27 import com.ibm.icu.util.ULocale;
28 
29 public final class CollationBuilder extends CollationRuleParser.Sink {
30     private static final boolean DEBUG = false;
31     private static final class BundleImporter implements CollationRuleParser.Importer {
BundleImporter()32         BundleImporter() {}
33         @Override
getRules(String localeID, String collationType)34         public String getRules(String localeID, String collationType) {
35             return CollationLoader.loadRules(new ULocale(localeID), collationType);
36         }
37     }
38 
CollationBuilder(CollationTailoring b)39     public CollationBuilder(CollationTailoring b) {
40         nfd = Normalizer2.getNFDInstance();
41         fcd = Norm2AllModes.getFCDNormalizer2();
42         nfcImpl = Norm2AllModes.getNFCInstance().impl;
43         base = b;
44         baseData = b.data;
45         rootElements = new CollationRootElements(b.data.rootElements);
46         variableTop = 0;
47         dataBuilder = new CollationDataBuilder();
48         fastLatinEnabled = true;
49         cesLength = 0;
50         rootPrimaryIndexes = new UVector32();
51         nodes = new UVector64();
52         nfcImpl.ensureCanonIterData();
53         dataBuilder.initForTailoring(baseData);
54     }
55 
parseAndBuild(String ruleString)56     public CollationTailoring parseAndBuild(String ruleString) throws ParseException {
57         if(baseData.rootElements == null) {
58             // C++ U_MISSING_RESOURCE_ERROR
59             throw new UnsupportedOperationException(
60                     "missing root elements data, tailoring not supported");
61         }
62         CollationTailoring tailoring = new CollationTailoring(base.settings);
63         CollationRuleParser parser = new CollationRuleParser(baseData);
64         // Note: This always bases &[last variable] and &[first regular]
65         // on the root collator's maxVariable/variableTop.
66         // If we wanted this to change after [maxVariable x], then we would keep
67         // the tailoring.settings pointer here and read its variableTop when we need it.
68         // See http://unicode.org/cldr/trac/ticket/6070
69         variableTop = base.settings.readOnly().variableTop;
70         parser.setSink(this);
71         // In Java, there is only one Importer implementation.
72         // In C++, the importer is a parameter for this method.
73         parser.setImporter(new BundleImporter());
74         CollationSettings ownedSettings = tailoring.settings.copyOnWrite();
75         parser.parse(ruleString, ownedSettings);
76         if(dataBuilder.hasMappings()) {
77             makeTailoredCEs();
78             closeOverComposites();
79             finalizeCEs();
80             // Copy all of ASCII, and Latin-1 letters, into each tailoring.
81             optimizeSet.add(0, 0x7f);
82             optimizeSet.add(0xc0, 0xff);
83             // Hangul is decomposed on the fly during collation,
84             // and the tailoring data is always built with HANGUL_TAG specials.
85             optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
86             dataBuilder.optimize(optimizeSet);
87             tailoring.ensureOwnedData();
88             if(fastLatinEnabled) { dataBuilder.enableFastLatin(); }
89             dataBuilder.build(tailoring.ownedData);
90             // C++ tailoring.builder = dataBuilder;
91             dataBuilder = null;
92         } else {
93             tailoring.data = baseData;
94         }
95         ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
96                 tailoring.data, ownedSettings,
97                 ownedSettings.fastLatinPrimaries);
98         tailoring.setRules(ruleString);
99         // In Java, we do not have a rules version.
100         // In C++, the genrb build tool reads and supplies one,
101         // and the rulesVersion is a parameter for this method.
102         tailoring.setVersion(base.version, 0 /* rulesVersion */);
103         return tailoring;
104     }
105 
106     /** Implements CollationRuleParser.Sink. */
107     @Override
addReset(int strength, CharSequence str)108     void addReset(int strength, CharSequence str) {
109         assert(str.length() != 0);
110         if(str.charAt(0) == CollationRuleParser.POS_LEAD) {
111             ces[0] = getSpecialResetPosition(str);
112             cesLength = 1;
113             assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0);
114         } else {
115             // normal reset to a character or string
116             String nfdString = nfd.normalize(str);
117             cesLength = dataBuilder.getCEs(nfdString, ces, 0);
118             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
119                 throw new IllegalArgumentException(
120                         "reset position maps to too many collation elements (more than 31)");
121             }
122         }
123         if(strength == Collator.IDENTICAL) { return; }  // simple reset-at-position
124 
125         // &[before strength]position
126         assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY);
127         int index = findOrInsertNodeForCEs(strength);
128 
129         long node = nodes.elementAti(index);
130         // If the index is for a "weaker" node,
131         // then skip backwards over this and further "weaker" nodes.
132         while(strengthFromNode(node) > strength) {
133             index = previousIndexFromNode(node);
134             node = nodes.elementAti(index);
135         }
136 
137         // Find or insert a node whose index we will put into a temporary CE.
138         if(strengthFromNode(node) == strength && isTailoredNode(node)) {
139             // Reset to just before this same-strength tailored node.
140             index = previousIndexFromNode(node);
141         } else if(strength == Collator.PRIMARY) {
142             // root primary node (has no previous index)
143             long p = weight32FromNode(node);
144             if(p == 0) {
145                 throw new UnsupportedOperationException(
146                         "reset primary-before ignorable not possible");
147             }
148             if(p <= rootElements.getFirstPrimary()) {
149                 // There is no primary gap between ignorables and the space-first-primary.
150                 throw new UnsupportedOperationException(
151                         "reset primary-before first non-ignorable not supported");
152             }
153             if(p == Collation.FIRST_TRAILING_PRIMARY) {
154                 // We do not support tailoring to an unassigned-implicit CE.
155                 throw new UnsupportedOperationException(
156                         "reset primary-before [first trailing] not supported");
157             }
158             p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p));
159             index = findOrInsertNodeForPrimary(p);
160             // Go to the last node in this list:
161             // Tailor after the last node between adjacent root nodes.
162             for(;;) {
163                 node = nodes.elementAti(index);
164                 int nextIndex = nextIndexFromNode(node);
165                 if(nextIndex == 0) { break; }
166                 index = nextIndex;
167             }
168         } else {
169             // &[before 2] or &[before 3]
170             index = findCommonNode(index, Collator.SECONDARY);
171             if(strength >= Collator.TERTIARY) {
172                 index = findCommonNode(index, Collator.TERTIARY);
173             }
174             // findCommonNode() stayed on the stronger node or moved to
175             // an explicit common-weight node of the reset-before strength.
176             node = nodes.elementAti(index);
177             if(strengthFromNode(node) == strength) {
178                 // Found a same-strength node with an explicit weight.
179                 int weight16 = weight16FromNode(node);
180                 if(weight16 == 0) {
181                     throw new UnsupportedOperationException(
182                             (strength == Collator.SECONDARY) ?
183                                     "reset secondary-before secondary ignorable not possible" :
184                                     "reset tertiary-before completely ignorable not possible");
185                 }
186                 assert(weight16 > Collation.BEFORE_WEIGHT16);
187                 // Reset to just before this node.
188                 // Insert the preceding same-level explicit weight if it is not there already.
189                 // Which explicit weight immediately precedes this one?
190                 weight16 = getWeight16Before(index, node, strength);
191                 // Does this preceding weight have a node?
192                 int previousWeight16;
193                 int previousIndex = previousIndexFromNode(node);
194                 for(int i = previousIndex;; i = previousIndexFromNode(node)) {
195                     node = nodes.elementAti(i);
196                     int previousStrength = strengthFromNode(node);
197                     if(previousStrength < strength) {
198                         assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex);
199                         // Either the reset element has an above-common weight and
200                         // the parent node provides the implied common weight,
201                         // or the reset element has a weight<=common in the node
202                         // right after the parent, and we need to insert the preceding weight.
203                         previousWeight16 = Collation.COMMON_WEIGHT16;
204                         break;
205                     } else if(previousStrength == strength && !isTailoredNode(node)) {
206                         previousWeight16 = weight16FromNode(node);
207                         break;
208                     }
209                     // Skip weaker nodes and same-level tailored nodes.
210                 }
211                 if(previousWeight16 == weight16) {
212                     // The preceding weight has a node,
213                     // maybe with following weaker or tailored nodes.
214                     // Reset to the last of them.
215                     index = previousIndex;
216                 } else {
217                     // Insert a node with the preceding weight, reset to that.
218                     node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
219                     index = insertNodeBetween(previousIndex, index, node);
220                 }
221             } else {
222                 // Found a stronger node with implied strength-common weight.
223                 int weight16 = getWeight16Before(index, node, strength);
224                 index = findOrInsertWeakNode(index, weight16, strength);
225             }
226             // Strength of the temporary CE = strength of its reset position.
227             // Code above raises an error if the before-strength is stronger.
228             strength = ceStrength(ces[cesLength - 1]);
229         }
230         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
231     }
232 
233     /**
234      * Returns the secondary or tertiary weight preceding the current node's weight.
235      * node=nodes[index].
236      */
getWeight16Before(int index, long node, int level)237     private int getWeight16Before(int index, long node, int level) {
238         assert(strengthFromNode(node) < level || !isTailoredNode(node));
239         // Collect the root CE weights if this node is for a root CE.
240         // If it is not, then return the low non-primary boundary for a tailored CE.
241         int t;
242         if(strengthFromNode(node) == Collator.TERTIARY) {
243             t = weight16FromNode(node);
244         } else {
245             t = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
246         }
247         while(strengthFromNode(node) > Collator.SECONDARY) {
248             index = previousIndexFromNode(node);
249             node = nodes.elementAti(index);
250         }
251         if(isTailoredNode(node)) {
252             return Collation.BEFORE_WEIGHT16;
253         }
254         int s;
255         if(strengthFromNode(node) == Collator.SECONDARY) {
256             s = weight16FromNode(node);
257         } else {
258             s = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
259         }
260         while(strengthFromNode(node) > Collator.PRIMARY) {
261             index = previousIndexFromNode(node);
262             node = nodes.elementAti(index);
263         }
264         if(isTailoredNode(node)) {
265             return Collation.BEFORE_WEIGHT16;
266         }
267         // [p, s, t] is a root CE. Return the preceding weight for the requested level.
268         long p = weight32FromNode(node);
269         int weight16;
270         if(level == Collator.SECONDARY) {
271             weight16 = rootElements.getSecondaryBefore(p, s);
272         } else {
273             weight16 = rootElements.getTertiaryBefore(p, s, t);
274             assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
275         }
276         return weight16;
277     }
278 
getSpecialResetPosition(CharSequence str)279     private long getSpecialResetPosition(CharSequence str) {
280         assert(str.length() == 2);
281         long ce;
282         int strength = Collator.PRIMARY;
283         boolean isBoundary = false;
284         CollationRuleParser.Position pos =
285                 CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE];
286         switch(pos) {
287         case FIRST_TERTIARY_IGNORABLE:
288             // Quaternary CEs are not supported.
289             // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
290             return 0;
291         case LAST_TERTIARY_IGNORABLE:
292             return 0;
293         case FIRST_SECONDARY_IGNORABLE: {
294             // Look for a tailored tertiary node after [0, 0, 0].
295             int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY);
296             long node = nodes.elementAti(index);
297             if((index = nextIndexFromNode(node)) != 0) {
298                 node = nodes.elementAti(index);
299                 assert(strengthFromNode(node) <= Collator.TERTIARY);
300                 if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) {
301                     return tempCEFromIndexAndStrength(index, Collator.TERTIARY);
302                 }
303             }
304             return rootElements.getFirstTertiaryCE();
305             // No need to look for nodeHasAnyBefore() on a tertiary node.
306         }
307         case LAST_SECONDARY_IGNORABLE:
308             ce = rootElements.getLastTertiaryCE();
309             strength = Collator.TERTIARY;
310             break;
311         case FIRST_PRIMARY_IGNORABLE: {
312             // Look for a tailored secondary node after [0, 0, *].
313             int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY);
314             long node = nodes.elementAti(index);
315             while((index = nextIndexFromNode(node)) != 0) {
316                 node = nodes.elementAti(index);
317                 strength = strengthFromNode(node);
318                 if(strength < Collator.SECONDARY) { break; }
319                 if(strength == Collator.SECONDARY) {
320                     if(isTailoredNode(node)) {
321                         if(nodeHasBefore3(node)) {
322                             index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
323                             assert(isTailoredNode(nodes.elementAti(index)));
324                         }
325                         return tempCEFromIndexAndStrength(index, Collator.SECONDARY);
326                     } else {
327                         break;
328                     }
329                 }
330             }
331             ce = rootElements.getFirstSecondaryCE();
332             strength = Collator.SECONDARY;
333             break;
334         }
335         case LAST_PRIMARY_IGNORABLE:
336             ce = rootElements.getLastSecondaryCE();
337             strength = Collator.SECONDARY;
338             break;
339         case FIRST_VARIABLE:
340             ce = rootElements.getFirstPrimaryCE();
341             isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
342             break;
343         case LAST_VARIABLE:
344             ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
345             break;
346         case FIRST_REGULAR:
347             ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
348             isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
349             break;
350         case LAST_REGULAR:
351             // Use the Hani-first-primary rather than the actual last "regular" CE before it,
352             // for backward compatibility with behavior before the introduction of
353             // script-first-primary CEs in the root collator.
354             ce = rootElements.firstCEWithPrimaryAtLeast(
355                 baseData.getFirstPrimaryForGroup(UScript.HAN));
356             break;
357         case FIRST_IMPLICIT:
358             ce = baseData.getSingleCE(0x4e00);
359             break;
360         case LAST_IMPLICIT:
361             // We do not support tailoring to an unassigned-implicit CE.
362             throw new UnsupportedOperationException(
363                     "reset to [last implicit] not supported");
364         case FIRST_TRAILING:
365             ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY);
366             isBoundary = true;  // trailing first primary (there is no mapping for it)
367             break;
368         case LAST_TRAILING:
369             throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF");
370         default:
371             assert(false);
372             return 0;
373         }
374 
375         int index = findOrInsertNodeForRootCE(ce, strength);
376         long node = nodes.elementAti(index);
377         if((pos.ordinal() & 1) == 0) {
378             // even pos = [first xyz]
379             if(!nodeHasAnyBefore(node) && isBoundary) {
380                 // A <group> first primary boundary is artificially added to FractionalUCA.txt.
381                 // It is reachable via its special contraction, but is not normally used.
382                 // Find the first character tailored after the boundary CE,
383                 // or the first real root CE after it.
384                 if((index = nextIndexFromNode(node)) != 0) {
385                     // If there is a following node, then it must be tailored
386                     // because there are no root CEs with a boundary primary
387                     // and non-common secondary/tertiary weights.
388                     node = nodes.elementAti(index);
389                     assert(isTailoredNode(node));
390                     ce = tempCEFromIndexAndStrength(index, strength);
391                 } else {
392                     assert(strength == Collator.PRIMARY);
393                     long p = ce >>> 32;
394                     int pIndex = rootElements.findPrimary(p);
395                     boolean isCompressible = baseData.isCompressiblePrimary(p);
396                     p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
397                     ce = Collation.makeCE(p);
398                     index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY);
399                     node = nodes.elementAti(index);
400                 }
401             }
402             if(nodeHasAnyBefore(node)) {
403                 // Get the first node that was tailored before this one at a weaker strength.
404                 if(nodeHasBefore2(node)) {
405                     index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
406                     node = nodes.elementAti(index);
407                 }
408                 if(nodeHasBefore3(node)) {
409                     index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
410                 }
411                 assert(isTailoredNode(nodes.elementAti(index)));
412                 ce = tempCEFromIndexAndStrength(index, strength);
413             }
414         } else {
415             // odd pos = [last xyz]
416             // Find the last node that was tailored after the [last xyz]
417             // at a strength no greater than the position's strength.
418             for(;;) {
419                 int nextIndex = nextIndexFromNode(node);
420                 if(nextIndex == 0) { break; }
421                 long nextNode = nodes.elementAti(nextIndex);
422                 if(strengthFromNode(nextNode) < strength) { break; }
423                 index = nextIndex;
424                 node = nextNode;
425             }
426             // Do not make a temporary CE for a root node.
427             // This last node might be the node for the root CE itself,
428             // or a node with a common secondary or tertiary weight.
429             if(isTailoredNode(node)) {
430                 ce = tempCEFromIndexAndStrength(index, strength);
431             }
432         }
433         return ce;
434     }
435 
436     /** Implements CollationRuleParser.Sink. */
437     @Override
addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension)438     void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) {
439         String nfdPrefix;
440         if(prefix.length() == 0) {
441             nfdPrefix = "";
442         } else {
443             nfdPrefix = nfd.normalize(prefix);
444         }
445         String nfdString = nfd.normalize(str);
446 
447         // The runtime code decomposes Hangul syllables on the fly,
448         // with recursive processing but without making the Jamo pieces visible for matching.
449         // It does not work with certain types of contextual mappings.
450         int nfdLength = nfdString.length();
451         if(nfdLength >= 2) {
452             char c = nfdString.charAt(0);
453             if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) {
454                 // While handling a Hangul syllable, contractions starting with Jamo L or V
455                 // would not see the following Jamo of that syllable.
456                 throw new UnsupportedOperationException(
457                         "contractions starting with conjoining Jamo L or V not supported");
458             }
459             c = nfdString.charAt(nfdLength - 1);
460             if(Hangul.isJamoL(c) ||
461                     (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) {
462                 // A contraction ending with Jamo L or L+V would require
463                 // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
464                 // or decomposing a following Hangul syllable on the fly, during contraction matching.
465                 throw new UnsupportedOperationException(
466                         "contractions ending with conjoining Jamo L or L+V not supported");
467             }
468             // A Hangul syllable completely inside a contraction is ok.
469         }
470         // Note: If there is a prefix, then the parser checked that
471         // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
472         // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
473         // (While handling a Hangul syllable, prefixes on Jamo V or T
474         // would not see the previous Jamo of that syllable.)
475 
476         if(strength != Collator.IDENTICAL) {
477             // Find the node index after which we insert the new tailored node.
478             int index = findOrInsertNodeForCEs(strength);
479             assert(cesLength > 0);
480             long ce = ces[cesLength - 1];
481             if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) {
482                 // There is no primary gap between ignorables and the space-first-primary.
483                 throw new UnsupportedOperationException(
484                         "tailoring primary after ignorables not supported");
485             }
486             if(strength == Collator.QUATERNARY && ce == 0) {
487                 // The CE data structure does not support non-zero quaternary weights
488                 // on tertiary ignorables.
489                 throw new UnsupportedOperationException(
490                         "tailoring quaternary after tertiary ignorables not supported");
491             }
492             // Insert the new tailored node.
493             index = insertTailoredNodeAfter(index, strength);
494             // Strength of the temporary CE:
495             // The new relation may yield a stronger CE but not a weaker one.
496             int tempStrength = ceStrength(ce);
497             if(strength < tempStrength) { tempStrength = strength; }
498             ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
499         }
500 
501         setCaseBits(nfdString);
502 
503         int cesLengthBeforeExtension = cesLength;
504         if(extension.length() != 0) {
505             String nfdExtension = nfd.normalize(extension);
506             cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength);
507             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
508                 throw new IllegalArgumentException(
509                         "extension string adds too many collation elements (more than 31 total)");
510             }
511         }
512         int ce32 = Collation.UNASSIGNED_CE32;
513         if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) &&
514                 !ignorePrefix(prefix) && !ignoreString(str)) {
515             // Map from the original input to the CEs.
516             // We do this in case the canonical closure is incomplete,
517             // so that it is possible to explicitly provide the missing mappings.
518             ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32);
519         }
520         addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32);
521         cesLength = cesLengthBeforeExtension;
522     }
523 
524     /**
525      * Picks one of the current CEs and finds or inserts a node in the graph
526      * for the CE + strength.
527      */
findOrInsertNodeForCEs(int strength)528     private int findOrInsertNodeForCEs(int strength) {
529         assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY);
530 
531         // Find the last CE that is at least as "strong" as the requested difference.
532         // Note: Stronger is smaller (Collator.PRIMARY=0).
533         long ce;
534         for(;; --cesLength) {
535             if(cesLength == 0) {
536                 ce = ces[0] = 0;
537                 cesLength = 1;
538                 break;
539             } else {
540                 ce = ces[cesLength - 1];
541             }
542             if(ceStrength(ce) <= strength) { break; }
543         }
544 
545         if(isTempCE(ce)) {
546             // No need to findCommonNode() here for lower levels
547             // because insertTailoredNodeAfter() will do that anyway.
548             return indexFromTempCE(ce);
549         }
550 
551         // root CE
552         if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) {
553             throw new UnsupportedOperationException(
554                     "tailoring relative to an unassigned code point not supported");
555         }
556         return findOrInsertNodeForRootCE(ce, strength);
557     }
558 
findOrInsertNodeForRootCE(long ce, int strength)559     private int findOrInsertNodeForRootCE(long ce, int strength) {
560         assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE);
561 
562         // Find or insert the node for each of the root CE's weights,
563         // down to the requested level/strength.
564         // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
565         assert((ce & 0xc0) == 0);
566         int index = findOrInsertNodeForPrimary(ce >>> 32);
567         if(strength >= Collator.SECONDARY) {
568             int lower32 = (int)ce;
569             index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
570             if(strength >= Collator.TERTIARY) {
571                 index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK,
572                                             Collator.TERTIARY);
573             }
574         }
575         return index;
576     }
577 
578     /**
579      * Like Java Collections.binarySearch(List, key, Comparator).
580      *
581      * @return the index>=0 where the item was found,
582      *         or the index<0 for inserting the string at ~index in sorted order
583      *         (index into rootPrimaryIndexes)
584      */
binarySearchForRootPrimaryNode( int[] rootPrimaryIndexes, int length, long[] nodes, long p)585     private static final int binarySearchForRootPrimaryNode(
586             int[] rootPrimaryIndexes, int length, long[] nodes, long p) {
587         if(length == 0) { return ~0; }
588         int start = 0;
589         int limit = length;
590         for (;;) {
591             int i = (int)(((long)start + (long)limit) / 2);
592             long node = nodes[rootPrimaryIndexes[i]];
593             long nodePrimary = node >>> 32;  // weight32FromNode(node)
594             if (p == nodePrimary) {
595                 return i;
596             } else if (p < nodePrimary) {
597                 if (i == start) {
598                     return ~start;  // insert s before i
599                 }
600                 limit = i;
601             } else {
602                 if (i == start) {
603                     return ~(start + 1);  // insert s after i
604                 }
605                 start = i;
606             }
607         }
608     }
609 
610     /** Finds or inserts the node for a root CE's primary weight. */
findOrInsertNodeForPrimary(long p)611     private int findOrInsertNodeForPrimary(long p) {
612         int rootIndex = binarySearchForRootPrimaryNode(
613             rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
614         if(rootIndex >= 0) {
615             return rootPrimaryIndexes.elementAti(rootIndex);
616         } else {
617             // Start a new list of nodes with this primary.
618             int index = nodes.size();
619             nodes.addElement(nodeFromWeight32(p));
620             rootPrimaryIndexes.insertElementAt(index, ~rootIndex);
621             return index;
622         }
623     }
624 
625     /** Finds or inserts the node for a secondary or tertiary weight. */
findOrInsertWeakNode(int index, int weight16, int level)626     private int findOrInsertWeakNode(int index, int weight16, int level) {
627         assert(0 <= index && index < nodes.size());
628         assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY);
629 
630         if(weight16 == Collation.COMMON_WEIGHT16) {
631             return findCommonNode(index, level);
632         }
633 
634         // If this will be the first below-common weight for the parent node,
635         // then we will also need to insert a common weight after it.
636         long node = nodes.elementAti(index);
637         assert(strengthFromNode(node) < level);  // parent node is stronger
638         if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) {
639             int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
640             if((node & hasThisLevelBefore) == 0) {
641                 // The parent node has an implied level-common weight.
642                 long commonNode =
643                     nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level);
644                 if(level == Collator.SECONDARY) {
645                     // Move the HAS_BEFORE3 flag from the parent node
646                     // to the new secondary common node.
647                     commonNode |= node & HAS_BEFORE3;
648                     node &= ~(long)HAS_BEFORE3;
649                 }
650                 nodes.setElementAt(node | hasThisLevelBefore, index);
651                 // Insert below-common-weight node.
652                 int nextIndex = nextIndexFromNode(node);
653                 node = nodeFromWeight16(weight16) | nodeFromStrength(level);
654                 index = insertNodeBetween(index, nextIndex, node);
655                 // Insert common-weight node.
656                 insertNodeBetween(index, nextIndex, commonNode);
657                 // Return index of below-common-weight node.
658                 return index;
659             }
660         }
661 
662         // Find the root CE's weight for this level.
663         // Postpone insertion if not found:
664         // Insert the new root node before the next stronger node,
665         // or before the next root node with the same strength and a larger weight.
666         int nextIndex;
667         while((nextIndex = nextIndexFromNode(node)) != 0) {
668             node = nodes.elementAti(nextIndex);
669             int nextStrength = strengthFromNode(node);
670             if(nextStrength <= level) {
671                 // Insert before a stronger node.
672                 if(nextStrength < level) { break; }
673                 // nextStrength == level
674                 if(!isTailoredNode(node)) {
675                     int nextWeight16 = weight16FromNode(node);
676                     if(nextWeight16 == weight16) {
677                         // Found the node for the root CE up to this level.
678                         return nextIndex;
679                     }
680                     // Insert before a node with a larger same-strength weight.
681                     if(nextWeight16 > weight16) { break; }
682                 }
683             }
684             // Skip the next node.
685             index = nextIndex;
686         }
687         node = nodeFromWeight16(weight16) | nodeFromStrength(level);
688         return insertNodeBetween(index, nextIndex, node);
689     }
690 
691     /**
692      * Makes and inserts a new tailored node into the list, after the one at index.
693      * Skips over nodes of weaker strength to maintain collation order
694      * ("postpone insertion").
695      * @return the new node's index
696      */
697     private int insertTailoredNodeAfter(int index, int strength) {
698         assert(0 <= index && index < nodes.size());
699         if(strength >= Collator.SECONDARY) {
700             index = findCommonNode(index, Collator.SECONDARY);
701             if(strength >= Collator.TERTIARY) {
702                 index = findCommonNode(index, Collator.TERTIARY);
703             }
704         }
705         // Postpone insertion:
706         // Insert the new node before the next one with a strength at least as strong.
707         long node = nodes.elementAti(index);
708         int nextIndex;
709         while((nextIndex = nextIndexFromNode(node)) != 0) {
710             node = nodes.elementAti(nextIndex);
711             if(strengthFromNode(node) <= strength) { break; }
712             // Skip the next node which has a weaker (larger) strength than the new one.
713             index = nextIndex;
714         }
715         node = IS_TAILORED | nodeFromStrength(strength);
716         return insertNodeBetween(index, nextIndex, node);
717     }
718 
719     /**
720      * Inserts a new node into the list, between list-adjacent items.
721      * The node's previous and next indexes must not be set yet.
722      * @return the new node's index
723      */
724     private int insertNodeBetween(int index, int nextIndex, long node) {
725         assert(previousIndexFromNode(node) == 0);
726         assert(nextIndexFromNode(node) == 0);
727         assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
728         // Append the new node and link it to the existing nodes.
729         int newIndex = nodes.size();
730         node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
731         nodes.addElement(node);
732         // nodes[index].nextIndex = newIndex
733         node = nodes.elementAti(index);
734         nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
735         // nodes[nextIndex].previousIndex = newIndex
736         if(nextIndex != 0) {
737             node = nodes.elementAti(nextIndex);
738             nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
739         }
740         return newIndex;
741     }
742 
743     /**
744      * Finds the node which implies or contains a common=05 weight of the given strength
745      * (secondary or tertiary), if the current node is stronger.
746      * Skips weaker nodes and tailored nodes if the current node is stronger
747      * and is followed by an explicit-common-weight node.
748      * Always returns the input index if that node is no stronger than the given strength.
749      */
750     private int findCommonNode(int index, int strength) {
751         assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY);
752         long node = nodes.elementAti(index);
753         if(strengthFromNode(node) >= strength) {
754             // The current node is no stronger.
755             return index;
756         }
757         if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
758             // The current node implies the strength-common weight.
759             return index;
760         }
761         index = nextIndexFromNode(node);
762         node = nodes.elementAti(index);
763         assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
764                 weight16FromNode(node) < Collation.COMMON_WEIGHT16);
765         // Skip to the explicit common node.
766         do {
767             index = nextIndexFromNode(node);
768             node = nodes.elementAti(index);
769             assert(strengthFromNode(node) >= strength);
770         } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
771                 weight16FromNode(node) < Collation.COMMON_WEIGHT16);
772         assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
773         return index;
774     }
775 
776     private void setCaseBits(CharSequence nfdString) {
777         int numTailoredPrimaries = 0;
778         for(int i = 0; i < cesLength; ++i) {
779             if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; }
780         }
781         // We should not be able to get too many case bits because
782         // cesLength<=31==MAX_EXPANSION_LENGTH.
783         // 31 pairs of case bits fit into an long without setting its sign bit.
784         assert(numTailoredPrimaries <= 31);
785 
786         long cases = 0;
787         if(numTailoredPrimaries > 0) {
788             CharSequence s = nfdString;
789             UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0);
790             int baseCEsLength = baseCEs.fetchCEs() - 1;
791             assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE);
792 
793             int lastCase = 0;
794             int numBasePrimaries = 0;
795             for(int i = 0; i < baseCEsLength; ++i) {
796                 long ce = baseCEs.getCE(i);
797                 if((ce >>> 32) != 0) {
798                     ++numBasePrimaries;
799                     int c = ((int)ce >> 14) & 3;
800                     assert(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
801                     if(numBasePrimaries < numTailoredPrimaries) {
802                         cases |= (long)c << ((numBasePrimaries - 1) * 2);
803                     } else if(numBasePrimaries == numTailoredPrimaries) {
804                         lastCase = c;
805                     } else if(c != lastCase) {
806                         // There are more base primary CEs than tailored primaries.
807                         // Set mixed case if the case bits of the remainder differ.
808                         lastCase = 1;
809                         // Nothing more can change.
810                         break;
811                     }
812                 }
813             }
814             if(numBasePrimaries >= numTailoredPrimaries) {
815                 cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2);
816             }
817         }
818 
819         for(int i = 0; i < cesLength; ++i) {
820             long ce = ces[i] & 0xffffffffffff3fffL;  // clear old case bits
821             int strength = ceStrength(ce);
822             if(strength == Collator.PRIMARY) {
823                 ce |= (cases & 3) << 14;
824                 cases >>>= 2;
825             } else if(strength == Collator.TERTIARY) {
826                 // Tertiary CEs must have uppercase bits.
827                 // See the LDML spec, and comments in class CollationCompare.
828                 ce |= 0x8000;
829             }
830             // Tertiary ignorable CEs must have 0 case bits.
831             // We set 0 case bits for secondary CEs too
832             // since currently only U+0345 is cased and maps to a secondary CE,
833             // and it is lowercase. Other secondaries are uncased.
834             // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
835             ces[i] = ce;
836         }
837     }
838 
839     /** Implements CollationRuleParser.Sink. */
840     @Override
841     void suppressContractions(UnicodeSet set) {
842         dataBuilder.suppressContractions(set);
843     }
844 
845     /** Implements CollationRuleParser.Sink. */
846     @Override
847     void optimize(UnicodeSet set) {
848         optimizeSet.addAll(set);
849     }
850 
851     /**
852      * Adds the mapping and its canonical closure.
853      * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder
854      * need not re-encode the CEs multiple times.
855      */
856     private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString,
857                 long[] newCEs, int newCEsLength, int ce32) {
858         // Map from the NFD input to the CEs.
859         ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
860         ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
861         addTailComposites(nfdPrefix, nfdString);
862         return ce32;
863     }
864 
865     private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
866                 long[] newCEs, int newCEsLength, int ce32) {
867         // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
868         // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
869         if(nfdPrefix.length() == 0) {
870             CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
871             String prefix = "";
872             for(;;) {
873                 String str = stringIter.next();
874                 if(str == null) { break; }
875                 if(ignoreString(str) || str.contentEquals(nfdString)) { continue; }
876                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
877             }
878         } else {
879             CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString());
880             CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
881             for(;;) {
882                 String prefix = prefixIter.next();
883                 if(prefix == null) { break; }
884                 if(ignorePrefix(prefix)) { continue; }
885                 boolean samePrefix = prefix.contentEquals(nfdPrefix);
886                 for(;;) {
887                     String str = stringIter.next();
888                     if(str == null) { break; }
889                     if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; }
890                     ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
891                 }
892                 stringIter.reset();
893             }
894         }
895         return ce32;
896     }
897 
898     private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
899         // Look for the last starter in the NFD string.
900         int lastStarter;
901         int indexAfterLastStarter = nfdString.length();
902         for(;;) {
903             if(indexAfterLastStarter == 0) { return; }  // no starter at all
904             lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
905             if(nfd.getCombiningClass(lastStarter) == 0) { break; }
906             indexAfterLastStarter -= Character.charCount(lastStarter);
907         }
908         // No closure to Hangul syllables since we decompose them on the fly.
909         if(Hangul.isJamoL(lastStarter)) { return; }
910 
911         // Are there any composites whose decomposition starts with the lastStarter?
912         // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
913         // We might find some more equivalent mappings here if it did.
914         UnicodeSet composites = new UnicodeSet();
915         if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
916 
917         StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
918         long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
919         UnicodeSetIterator iter = new UnicodeSetIterator(composites);
920         while(iter.next()) {
921             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
922             int composite = iter.codepoint;
923             String decomp = nfd.getDecomposition(composite);
924             if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
925                     newNFDString, newString)) {
926                 continue;
927             }
928             int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
929             if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
930                 // Ignore mappings that we cannot store.
931                 continue;
932             }
933             // Note: It is possible that the newCEs do not make use of the mapping
934             // for which we are adding the tail composites, in which case we might be adding
935             // unnecessary mappings.
936             // For example, when we add tail composites for ae^ (^=combining circumflex),
937             // UCA discontiguous-contraction matching does not find any matches
938             // for ae_^ (_=any combining diacritic below) *unless* there is also
939             // a contraction mapping for ae.
940             // Thus, if there is no ae contraction, then the ae^ mapping is ignored
941             // while fetching the newCEs for ae_^.
942             // TODO: Try to detect this effectively.
943             // (Alternatively, print a warning when prefix contractions are missing.)
944 
945             // We do not need an explicit mapping for the NFD strings.
946             // It is fine if the NFD input collates like this via a sequence of mappings.
947             // It also saves a little bit of space, and may reduce the set of characters with contractions.
948             int ce32 = addIfDifferent(nfdPrefix, newString,
949                                           newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
950             if(ce32 != Collation.UNASSIGNED_CE32) {
951                 // was different, was added
952                 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
953             }
954         }
955     }
956 
957     private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter,
958                 int composite, CharSequence decomp,
959                 StringBuilder newNFDString, StringBuilder newString) {
960         assert(Character.codePointBefore(nfdString, indexAfterLastStarter) ==
961                 Character.codePointAt(decomp, 0));
962         int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1);
963         if(lastStarterLength == decomp.length()) {
964             // Singleton decompositions should be found by addWithClosure()
965             // and the CanonicalIterator, so we can ignore them here.
966             return false;
967         }
968         if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) {
969             // same strings, nothing new to be found here
970             return false;
971         }
972 
973         // Make new FCD strings that combine a composite, or its decomposition,
974         // into the nfdString's last starter and the combining marks following it.
975         // Make an NFD version, and a version with the composite.
976         newNFDString.setLength(0);
977         newNFDString.append(nfdString, 0, indexAfterLastStarter);
978         newString.setLength(0);
979         newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength)
980             .appendCodePoint(composite);
981 
982         // The following is related to discontiguous contraction matching,
983         // but builds only FCD strings (or else returns false).
984         int sourceIndex = indexAfterLastStarter;
985         int decompIndex = lastStarterLength;
986         // Small optimization: We keep the source character across loop iterations
987         // because we do not always consume it,
988         // and then need not fetch it again nor look up its combining class again.
989         int sourceChar = Collation.SENTINEL_CP;
990         // The cc variables need to be declared before the loop so that at the end
991         // they are set to the last combining classes seen.
992         int sourceCC = 0;
993         int decompCC = 0;
994         for(;;) {
995             if(sourceChar < 0) {
996                 if(sourceIndex >= nfdString.length()) { break; }
997                 sourceChar = Character.codePointAt(nfdString, sourceIndex);
998                 sourceCC = nfd.getCombiningClass(sourceChar);
999                 assert(sourceCC != 0);
1000             }
1001             // We consume a decomposition character in each iteration.
1002             if(decompIndex >= decomp.length()) { break; }
1003             int decompChar = Character.codePointAt(decomp, decompIndex);
1004             decompCC = nfd.getCombiningClass(decompChar);
1005             // Compare the two characters and their combining classes.
1006             if(decompCC == 0) {
1007                 // Unable to merge because the source contains a non-zero combining mark
1008                 // but the composite's decomposition contains another starter.
1009                 // The strings would not be equivalent.
1010                 return false;
1011             } else if(sourceCC < decompCC) {
1012                 // Composite + sourceChar would not be FCD.
1013                 return false;
1014             } else if(decompCC < sourceCC) {
1015                 newNFDString.appendCodePoint(decompChar);
1016                 decompIndex += Character.charCount(decompChar);
1017             } else if(decompChar != sourceChar) {
1018                 // Blocked because same combining class.
1019                 return false;
1020             } else {  // match: decompChar == sourceChar
1021                 newNFDString.appendCodePoint(decompChar);
1022                 decompIndex += Character.charCount(decompChar);
1023                 sourceIndex += Character.charCount(decompChar);
1024                 sourceChar = Collation.SENTINEL_CP;
1025             }
1026         }
1027         // We are at the end of at least one of the two inputs.
1028         if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1029             if(sourceCC < decompCC) {
1030                 // Appending the next source character to the composite would not be FCD.
1031                 return false;
1032             }
1033             newNFDString.append(nfdString, sourceIndex, nfdString.length());
1034             newString.append(nfdString, sourceIndex, nfdString.length());
1035         } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1036             newNFDString.append(decomp, decompIndex, decomp.length());
1037         }
1038         assert(nfd.isNormalized(newNFDString));
1039         assert(fcd.isNormalized(newString));
1040         assert(nfd.normalize(newString).equals(newNFDString.toString()));  // canonically equivalent
1041         return true;
1042     }
1043 
1044     private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) {
1045         // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0
1046         int leftLength = left.length();
1047         if((leftLength - leftStart) != (right.length() - rightStart)) { return false; }
1048         while(leftStart < leftLength) {
1049             if(left.charAt(leftStart++) != right.charAt(rightStart++)) {
1050                 return false;
1051             }
1052         }
1053         return true;
1054     }
1055     private boolean ignorePrefix(CharSequence s) {
1056         // Do not map non-FCD prefixes.
1057         return !isFCD(s);
1058     }
1059     private boolean ignoreString(CharSequence s) {
1060         // Do not map non-FCD strings.
1061         // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1062         return !isFCD(s) || Hangul.isHangul(s.charAt(0));
1063     }
1064     private boolean isFCD(CharSequence s) {
1065         return fcd.isNormalized(s);
1066     }
1067 
1068     private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]");
1069     static {
1070         // Hangul is decomposed on the fly during collation.
1071         COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
1072     }
1073 
1074     private void closeOverComposites() {
1075         String prefix = "";  // empty
1076         UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES);
1077         while(iter.next()) {
1078             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1079             String nfdString = nfd.getDecomposition(iter.codepoint);
1080             cesLength = dataBuilder.getCEs(nfdString, ces, 0);
1081             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
1082                 // Too many CEs from the decomposition (unusual), ignore this composite.
1083                 // We could add a capacity parameter to getCEs() and reallocate if necessary.
1084                 // However, this can only really happen in contrived cases.
1085                 continue;
1086             }
1087             String composite = iter.getString();
1088             addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32);
1089         }
1090     }
1091 
1092     private int addIfDifferent(CharSequence prefix, CharSequence str,
1093                 long[] newCEs, int newCEsLength, int ce32) {
1094         long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH];
1095         int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0);
1096         if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1097             if(ce32 == Collation.UNASSIGNED_CE32) {
1098                 ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength);
1099             }
1100             dataBuilder.addCE32(prefix, str, ce32);
1101         }
1102         return ce32;
1103     }
1104 
1105     private static boolean sameCEs(long[] ces1, int ces1Length,
1106                 long[] ces2, int ces2Length) {
1107         if(ces1Length != ces2Length) {
1108             return false;
1109         }
1110         assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH);
1111         for(int i = 0; i < ces1Length; ++i) {
1112             if(ces1[i] != ces2[i]) { return false; }
1113         }
1114         return true;
1115     }
1116 
1117     private static final int alignWeightRight(int w) {
1118         if(w != 0) {
1119             while((w & 0xff) == 0) { w >>>= 8; }
1120         }
1121         return w;
1122     }
1123 
1124     /**
1125      * Walks the tailoring graph and overwrites tailored nodes with new CEs.
1126      * After this, the graph is destroyed.
1127      * The nodes array can then be used only as a source of tailored CEs.
1128      */
1129     private void makeTailoredCEs() {
1130         CollationWeights primaries = new CollationWeights();
1131         CollationWeights secondaries = new CollationWeights();
1132         CollationWeights tertiaries = new CollationWeights();
1133         long[] nodesArray = nodes.getBuffer();
1134         if(DEBUG) {
1135             System.out.println("\nCollationBuilder.makeTailoredCEs()");
1136         }
1137 
1138         for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1139             int i = rootPrimaryIndexes.elementAti(rpi);
1140             long node = nodesArray[i];
1141             long p = weight32FromNode(node);
1142             int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16;
1143             int t = s;
1144             int q = 0;
1145             boolean pIsTailored = false;
1146             boolean sIsTailored = false;
1147             boolean tIsTailored = false;
1148             if(DEBUG) {
1149                 System.out.printf("\nprimary     %x\n", alignWeightRight((int)p));
1150             }
1151             int pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1152             int nextIndex = nextIndexFromNode(node);
1153             while(nextIndex != 0) {
1154                 i = nextIndex;
1155                 node = nodesArray[i];
1156                 nextIndex = nextIndexFromNode(node);
1157                 int strength = strengthFromNode(node);
1158                 if(strength == Collator.QUATERNARY) {
1159                     assert(isTailoredNode(node));
1160                     if(DEBUG) {
1161                         System.out.print("      quat+     ");
1162                     }
1163                     if(q == 3) {
1164                         // C++ U_BUFFER_OVERFLOW_ERROR
1165                         throw new UnsupportedOperationException("quaternary tailoring gap too small");
1166                     }
1167                     ++q;
1168                 } else {
1169                     if(strength == Collator.TERTIARY) {
1170                         if(isTailoredNode(node)) {
1171                             if(DEBUG) {
1172                                 System.out.print("    ter+        ");
1173                             }
1174                             if(!tIsTailored) {
1175                                 // First tailored tertiary node for [p, s].
1176                                 int tCount = countTailoredNodes(nodesArray, nextIndex,
1177                                                                     Collator.TERTIARY) + 1;
1178                                 int tLimit;
1179                                 if(t == 0) {
1180                                     // Gap at the beginning of the tertiary CE range.
1181                                     t = rootElements.getTertiaryBoundary() - 0x100;
1182                                     tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
1183                                 } else if(!pIsTailored && !sIsTailored) {
1184                                     // p and s are root weights.
1185                                     tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
1186                                 } else if(t == Collation.BEFORE_WEIGHT16) {
1187                                     tLimit = Collation.COMMON_WEIGHT16;
1188                                 } else {
1189                                     // [p, s] is tailored.
1190                                     assert(t == Collation.COMMON_WEIGHT16);
1191                                     tLimit = rootElements.getTertiaryBoundary();
1192                                 }
1193                                 assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0);
1194                                 tertiaries.initForTertiary();
1195                                 if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1196                                     // C++ U_BUFFER_OVERFLOW_ERROR
1197                                     throw new UnsupportedOperationException("tertiary tailoring gap too small");
1198                                 }
1199                                 tIsTailored = true;
1200                             }
1201                             t = (int)tertiaries.nextWeight();
1202                             assert(t != 0xffffffff);
1203                         } else {
1204                             t = weight16FromNode(node);
1205                             tIsTailored = false;
1206                             if(DEBUG) {
1207                                 System.out.printf("    ter     %x\n", alignWeightRight(t));
1208                             }
1209                         }
1210                     } else {
1211                         if(strength == Collator.SECONDARY) {
1212                             if(isTailoredNode(node)) {
1213                                 if(DEBUG) {
1214                                     System.out.print("  sec+          ");
1215                                 }
1216                                 if(!sIsTailored) {
1217                                     // First tailored secondary node for p.
1218                                     int sCount = countTailoredNodes(nodesArray, nextIndex,
1219                                                                         Collator.SECONDARY) + 1;
1220                                     int sLimit;
1221                                     if(s == 0) {
1222                                         // Gap at the beginning of the secondary CE range.
1223                                         s = rootElements.getSecondaryBoundary() - 0x100;
1224                                         sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
1225                                     } else if(!pIsTailored) {
1226                                         // p is a root primary.
1227                                         sLimit = rootElements.getSecondaryAfter(pIndex, s);
1228                                     } else if(s == Collation.BEFORE_WEIGHT16) {
1229                                         sLimit = Collation.COMMON_WEIGHT16;
1230                                     } else {
1231                                         // p is a tailored primary.
1232                                         assert(s == Collation.COMMON_WEIGHT16);
1233                                         sLimit = rootElements.getSecondaryBoundary();
1234                                     }
1235                                     if(s == Collation.COMMON_WEIGHT16) {
1236                                         // Do not tailor into the getSortKey() range of
1237                                         // compressed common secondaries.
1238                                         s = rootElements.getLastCommonSecondary();
1239                                     }
1240                                     secondaries.initForSecondary();
1241                                     if(!secondaries.allocWeights(s, sLimit, sCount)) {
1242                                         // C++ U_BUFFER_OVERFLOW_ERROR
1243                                         if(DEBUG) {
1244                                             System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n",
1245                                                     alignWeightRight(s), alignWeightRight(sLimit),
1246                                                     alignWeightRight(sCount));
1247                                         }
1248                                         throw new UnsupportedOperationException("secondary tailoring gap too small");
1249                                     }
1250                                     sIsTailored = true;
1251                                 }
1252                                 s = (int)secondaries.nextWeight();
1253                                 assert(s != 0xffffffff);
1254                             } else {
1255                                 s = weight16FromNode(node);
1256                                 sIsTailored = false;
1257                                 if(DEBUG) {
1258                                     System.out.printf("  sec       %x\n", alignWeightRight(s));
1259                                 }
1260                             }
1261                         } else /* Collator.PRIMARY */ {
1262                             assert(isTailoredNode(node));
1263                             if(DEBUG) {
1264                                 System.out.print("pri+            ");
1265                             }
1266                             if(!pIsTailored) {
1267                                 // First tailored primary node in this list.
1268                                 int pCount = countTailoredNodes(nodesArray, nextIndex,
1269                                                                     Collator.PRIMARY) + 1;
1270                                 boolean isCompressible = baseData.isCompressiblePrimary(p);
1271                                 long pLimit =
1272                                     rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1273                                 primaries.initForPrimary(isCompressible);
1274                                 if(!primaries.allocWeights(p, pLimit, pCount)) {
1275                                     // C++ U_BUFFER_OVERFLOW_ERROR  // TODO: introduce a more specific UErrorCode?
1276                                     throw new UnsupportedOperationException("primary tailoring gap too small");
1277                                 }
1278                                 pIsTailored = true;
1279                             }
1280                             p = primaries.nextWeight();
1281                             assert(p != 0xffffffffL);
1282                             s = Collation.COMMON_WEIGHT16;
1283                             sIsTailored = false;
1284                         }
1285                         t = s == 0 ? 0 : Collation.COMMON_WEIGHT16;
1286                         tIsTailored = false;
1287                     }
1288                     q = 0;
1289                 }
1290                 if(isTailoredNode(node)) {
1291                     nodesArray[i] = Collation.makeCE(p, s, t, q);
1292                     if(DEBUG) {
1293                         System.out.printf("%016x\n", nodesArray[i]);
1294                     }
1295                 }
1296             }
1297         }
1298     }
1299 
1300     /**
1301      * Counts the tailored nodes of the given strength up to the next node
1302      * which is either stronger or has an explicit weight of this strength.
1303      */
1304     private static int countTailoredNodes(long[] nodesArray, int i, int strength) {
1305         int count = 0;
1306         for(;;) {
1307             if(i == 0) { break; }
1308             long node = nodesArray[i];
1309             if(strengthFromNode(node) < strength) { break; }
1310             if(strengthFromNode(node) == strength) {
1311                 if(isTailoredNode(node)) {
1312                     ++count;
1313                 } else {
1314                     break;
1315                 }
1316             }
1317             i = nextIndexFromNode(node);
1318         }
1319         return count;
1320     }
1321 
1322     private static final class CEFinalizer implements CollationDataBuilder.CEModifier {
1323         CEFinalizer(long[] ces) {
1324             finalCEs = ces;
1325         }
1326         @Override
1327         public long modifyCE32(int ce32) {
1328             assert(!Collation.isSpecialCE32(ce32));
1329             if(CollationBuilder.isTempCE32(ce32)) {
1330                 // retain case bits
1331                 return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1332             } else {
1333                 return Collation.NO_CE;
1334             }
1335         }
1336         @Override
1337         public long modifyCE(long ce) {
1338             if(CollationBuilder.isTempCE(ce)) {
1339                 // retain case bits
1340                 return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000);
1341             } else {
1342                 return Collation.NO_CE;
1343             }
1344         }
1345 
1346         private long[] finalCEs;
1347     };
1348 
1349     /** Replaces temporary CEs with the final CEs they point to. */
1350     private void finalizeCEs() {
1351         CollationDataBuilder newBuilder = new CollationDataBuilder();
1352         newBuilder.initForTailoring(baseData);
1353         CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer());
1354         newBuilder.copyFrom(dataBuilder, finalizer);
1355         dataBuilder = newBuilder;
1356     }
1357 
1358     /**
1359      * Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
1360      * with 2-byte primary, 1-byte secondary and 6-bit tertiary,
1361      * with valid CE byte values.
1362      *
1363      * The index must not exceed 20 bits (0xfffff).
1364      * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY).
1365      *
1366      * Temporary CEs are distinguished from real CEs by their use of
1367      * secondary weights 06..45 which are otherwise reserved for compressed sort keys.
1368      *
1369      * The case bits are unused and available.
1370      */
1371     private static long tempCEFromIndexAndStrength(int index, int strength) {
1372         return
1373             // CE byte offsets, to ensure valid CE bytes, and case bits 11
1374             0x4040000006002000L +
1375             // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
1376             ((long)(index & 0xfe000) << 43) +
1377             // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
1378             ((long)(index & 0x1fc0) << 42) +
1379             // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
1380             ((index & 0x3f) << 24) +
1381             // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
1382             (strength << 8);
1383     }
1384     private static int indexFromTempCE(long tempCE) {
1385         tempCE -= 0x4040000006002000L;
1386         return
1387             ((int)(tempCE >> 43) & 0xfe000) |
1388             ((int)(tempCE >> 42) & 0x1fc0) |
1389             ((int)(tempCE >> 24) & 0x3f);
1390     }
1391     private static int strengthFromTempCE(long tempCE) {
1392         return ((int)tempCE >> 8) & 3;
1393     }
1394     private static boolean isTempCE(long ce) {
1395         int sec = (int)ce >>> 24;
1396         return 6 <= sec && sec <= 0x45;
1397     }
1398 
1399     private static int indexFromTempCE32(int tempCE32) {
1400         tempCE32 -= 0x40400620;
1401         return
1402             ((tempCE32 >> 11) & 0xfe000) |
1403             ((tempCE32 >> 10) & 0x1fc0) |
1404             ((tempCE32 >> 8) & 0x3f);
1405     }
isTempCE32(int ce32)1406     private static boolean isTempCE32(int ce32) {
1407         return
1408             (ce32 & 0xff) >= 2 &&  // not a long-primary/long-secondary CE32
1409             6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
1410     }
1411 
ceStrength(long ce)1412     private static int ceStrength(long ce) {
1413         return
1414             isTempCE(ce) ? strengthFromTempCE(ce) :
1415             (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY :
1416             ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY :
1417             ce != 0 ? Collator.TERTIARY :
1418             Collator.IDENTICAL;
1419     }
1420 
1421     /** At most 1M nodes, limited by the 20 bits in node bit fields. */
1422     private static final int MAX_INDEX = 0xfffff;
1423     /**
1424      * Node bit 6 is set on a primary node if there are nodes
1425      * with secondary values below the common secondary weight (05).
1426      */
1427     private static final int HAS_BEFORE2 = 0x40;
1428     /**
1429      * Node bit 5 is set on a primary or secondary node if there are nodes
1430      * with tertiary values below the common tertiary weight (05).
1431      */
1432     private static final int HAS_BEFORE3 = 0x20;
1433     /**
1434      * Node bit 3 distinguishes a tailored node, which has no weight value,
1435      * from a node with an explicit (root or default) weight.
1436      */
1437     private static final int IS_TAILORED = 8;
1438 
nodeFromWeight32(long weight32)1439     private static long nodeFromWeight32(long weight32) {
1440         return weight32 << 32;
1441     }
nodeFromWeight16(int weight16)1442     private static long nodeFromWeight16(int weight16) {
1443         return (long)weight16 << 48;
1444     }
nodeFromPreviousIndex(int previous)1445     private static long nodeFromPreviousIndex(int previous) {
1446         return (long)previous << 28;
1447     }
nodeFromNextIndex(int next)1448     private static long nodeFromNextIndex(int next) {
1449         return next << 8;
1450     }
nodeFromStrength(int strength)1451     private static long nodeFromStrength(int strength) {
1452         return strength;
1453     }
1454 
weight32FromNode(long node)1455     private static long weight32FromNode(long node) {
1456         return node >>> 32;
1457     }
weight16FromNode(long node)1458     private static int weight16FromNode(long node) {
1459         return (int)(node >> 48) & 0xffff;
1460     }
previousIndexFromNode(long node)1461     private static int previousIndexFromNode(long node) {
1462         return (int)(node >> 28) & MAX_INDEX;
1463     }
nextIndexFromNode(long node)1464     private static int nextIndexFromNode(long node) {
1465         return ((int)node >> 8) & MAX_INDEX;
1466     }
strengthFromNode(long node)1467     private static int strengthFromNode(long node) {
1468         return (int)node & 3;
1469     }
1470 
nodeHasBefore2(long node)1471     private static boolean nodeHasBefore2(long node) {
1472         return (node & HAS_BEFORE2) != 0;
1473     }
nodeHasBefore3(long node)1474     private static boolean nodeHasBefore3(long node) {
1475         return (node & HAS_BEFORE3) != 0;
1476     }
nodeHasAnyBefore(long node)1477     private static boolean nodeHasAnyBefore(long node) {
1478         return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
1479     }
isTailoredNode(long node)1480     private static boolean isTailoredNode(long node) {
1481         return (node & IS_TAILORED) != 0;
1482     }
1483 
changeNodePreviousIndex(long node, int previous)1484     private static long changeNodePreviousIndex(long node, int previous) {
1485         return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous);
1486     }
changeNodeNextIndex(long node, int next)1487     private static long changeNodeNextIndex(long node, int next) {
1488         return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next);
1489     }
1490 
1491     private Normalizer2 nfd, fcd;
1492     private Normalizer2Impl nfcImpl;
1493 
1494     private CollationTailoring base;
1495     private CollationData baseData;
1496     private CollationRootElements rootElements;
1497     private long variableTop;
1498 
1499     private CollationDataBuilder dataBuilder;
1500     private boolean fastLatinEnabled;
1501     private UnicodeSet optimizeSet = new UnicodeSet();
1502 
1503     private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
1504     private int cesLength;
1505 
1506     /**
1507      * Indexes of nodes with root primary weights, sorted by primary.
1508      * Compact form of a TreeMap from root primary to node index.
1509      *
1510      * This is a performance optimization for finding reset positions.
1511      * Without this, we would have to search through the entire nodes list.
1512      * It also allows storing root primary weights in list head nodes,
1513      * without previous index, leaving room in root primary nodes for 32-bit primary weights.
1514      */
1515     private UVector32 rootPrimaryIndexes;
1516     /**
1517      * Data structure for assigning tailored weights and CEs.
1518      * Doubly-linked lists of nodes in mostly collation order.
1519      * Each list starts with a root primary node and ends with a nextIndex of 0.
1520      *
1521      * When there are any nodes in the list, then there is always a root primary node at index 0.
1522      * This allows some code not to have to check explicitly for nextIndex==0.
1523      *
1524      * Root primary nodes have 32-bit weights but do not have previous indexes.
1525      * All other nodes have at most 16-bit weights and do have previous indexes.
1526      *
1527      * Nodes with explicit weights store root collator weights,
1528      * or default weak weights (e.g., secondary 05) for stronger nodes.
1529      * "Tailored" nodes, with the IS_TAILORED bit set,
1530      * do not store explicit weights but rather
1531      * create a difference of a certain strength from the preceding node.
1532      *
1533      * A root node is followed by either
1534      * - a root/default node of the same strength, or
1535      * - a root/default node of the next-weaker strength, or
1536      * - a tailored node of the same strength.
1537      *
1538      * A node of a given strength normally implies "common" weights on weaker levels.
1539      *
1540      * A node with HAS_BEFORE2 must be immediately followed by
1541      * a secondary node with an explicit below-common weight, then a secondary tailored node,
1542      * and later an explicit common-secondary node.
1543      * The below-common weight can be a root weight,
1544      * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
1545      * or before the lowest root weight.
1546      * (&[before 2] resets to an explicit secondary node so that
1547      * the following addRelation(secondary) tailors right after that.
1548      * If we did not have this node and instead were to reset on the primary node,
1549      * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
1550      *
1551      * If the flag is not set, then there are no explicit secondary nodes
1552      * with the common or lower weights.
1553      *
1554      * Same for HAS_BEFORE3 for tertiary nodes and weights.
1555      * A node must not have both flags set.
1556      *
1557      * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
1558      * which point to stable indexes in this list,
1559      * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
1560      *
1561      * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
1562      * until the next relation is added.
1563      *
1564      * At the end, the tailored weights are allocated as necessary,
1565      * then the tailored nodes are replaced with final CEs,
1566      * and the CollationData is rewritten by replacing temporary CEs with final ones.
1567      *
1568      * We cannot simply insert new nodes in the middle of the array
1569      * because that would invalidate the indexes stored in existing temporary CEs.
1570      * We need to use a linked graph with stable indexes to existing nodes.
1571      * A doubly-linked list seems easiest to maintain.
1572      *
1573      * Each node is stored as an long, with its fields stored as bit fields.
1574      *
1575      * Root primary node:
1576      * - primary weight: 32 bits 63..32
1577      * - reserved/unused/zero: 4 bits 31..28
1578      *
1579      * Weaker root nodes & tailored nodes:
1580      * - a weight: 16 bits 63..48
1581      *   + a root or default weight for a non-tailored node
1582      *   + unused/zero for a tailored node
1583      * - index to the previous node: 20 bits 47..28
1584      *
1585      * All types of nodes:
1586      * - index to the next node: 20 bits 27..8
1587      *   + nextIndex=0 in last node per root-primary list
1588      * - reserved/unused/zero bits: bits 7, 4, 2
1589      * - HAS_BEFORE2: bit 6
1590      * - HAS_BEFORE3: bit 5
1591      * - IS_TAILORED: bit 3
1592      * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
1593      *
1594      * We could allocate structs with pointers, but we would have to store them
1595      * in a pointer list so that they can be indexed from temporary CEs,
1596      * and they would require more memory allocations.
1597      */
1598     private UVector64 nodes;
1599 }
1600