• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2013-2015, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * CollationBuilder.java, ported from collationbuilder.h/.cpp
10 *
11 * C++ version created on: 2013may06
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import java.text.ParseException;
18 
19 import ohos.global.icu.impl.Norm2AllModes;
20 import ohos.global.icu.impl.Normalizer2Impl;
21 import ohos.global.icu.impl.Normalizer2Impl.Hangul;
22 import ohos.global.icu.lang.UScript;
23 import ohos.global.icu.text.CanonicalIterator;
24 import ohos.global.icu.text.Collator;
25 import ohos.global.icu.text.Normalizer2;
26 import ohos.global.icu.text.UnicodeSet;
27 import ohos.global.icu.text.UnicodeSetIterator;
28 import ohos.global.icu.util.ULocale;
29 
30 /**
31  * @hide exposed on OHOS
32  */
33 public final class CollationBuilder extends CollationRuleParser.Sink {
34     private static final boolean DEBUG = false;
35     private static final class BundleImporter implements CollationRuleParser.Importer {
BundleImporter()36         BundleImporter() {}
37         @Override
getRules(String localeID, String collationType)38         public String getRules(String localeID, String collationType) {
39             return CollationLoader.loadRules(new ULocale(localeID), collationType);
40         }
41     }
42 
CollationBuilder(CollationTailoring b)43     public CollationBuilder(CollationTailoring b) {
44         nfd = Normalizer2.getNFDInstance();
45         fcd = Norm2AllModes.getFCDNormalizer2();
46         nfcImpl = Norm2AllModes.getNFCInstance().impl;
47         base = b;
48         baseData = b.data;
49         rootElements = new CollationRootElements(b.data.rootElements);
50         variableTop = 0;
51         dataBuilder = new CollationDataBuilder();
52         fastLatinEnabled = true;
53         cesLength = 0;
54         rootPrimaryIndexes = new UVector32();
55         nodes = new UVector64();
56         nfcImpl.ensureCanonIterData();
57         dataBuilder.initForTailoring(baseData);
58     }
59 
parseAndBuild(String ruleString)60     public CollationTailoring parseAndBuild(String ruleString) throws ParseException {
61         if(baseData.rootElements == null) {
62             // C++ U_MISSING_RESOURCE_ERROR
63             throw new UnsupportedOperationException(
64                     "missing root elements data, tailoring not supported");
65         }
66         CollationTailoring tailoring = new CollationTailoring(base.settings);
67         CollationRuleParser parser = new CollationRuleParser(baseData);
68         // Note: This always bases &[last variable] and &[first regular]
69         // on the root collator's maxVariable/variableTop.
70         // If we wanted this to change after [maxVariable x], then we would keep
71         // the tailoring.settings pointer here and read its variableTop when we need it.
72         // See http://unicode.org/cldr/trac/ticket/6070
73         variableTop = base.settings.readOnly().variableTop;
74         parser.setSink(this);
75         // In Java, there is only one Importer implementation.
76         // In C++, the importer is a parameter for this method.
77         parser.setImporter(new BundleImporter());
78         CollationSettings ownedSettings = tailoring.settings.copyOnWrite();
79         parser.parse(ruleString, ownedSettings);
80         if(dataBuilder.hasMappings()) {
81             makeTailoredCEs();
82             closeOverComposites();
83             finalizeCEs();
84             // Copy all of ASCII, and Latin-1 letters, into each tailoring.
85             optimizeSet.add(0, 0x7f);
86             optimizeSet.add(0xc0, 0xff);
87             // Hangul is decomposed on the fly during collation,
88             // and the tailoring data is always built with HANGUL_TAG specials.
89             optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
90             dataBuilder.optimize(optimizeSet);
91             tailoring.ensureOwnedData();
92             if(fastLatinEnabled) { dataBuilder.enableFastLatin(); }
93             dataBuilder.build(tailoring.ownedData);
94             // C++ tailoring.builder = dataBuilder;
95             dataBuilder = null;
96         } else {
97             tailoring.data = baseData;
98         }
99         ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
100                 tailoring.data, ownedSettings,
101                 ownedSettings.fastLatinPrimaries);
102         tailoring.setRules(ruleString);
103         // In Java, we do not have a rules version.
104         // In C++, the genrb build tool reads and supplies one,
105         // and the rulesVersion is a parameter for this method.
106         tailoring.setVersion(base.version, 0 /* rulesVersion */);
107         return tailoring;
108     }
109 
110     /** Implements CollationRuleParser.Sink. */
111     @Override
addReset(int strength, CharSequence str)112     void addReset(int strength, CharSequence str) {
113         assert(str.length() != 0);
114         if(str.charAt(0) == CollationRuleParser.POS_LEAD) {
115             ces[0] = getSpecialResetPosition(str);
116             cesLength = 1;
117             assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0);
118         } else {
119             // normal reset to a character or string
120             String nfdString = nfd.normalize(str);
121             cesLength = dataBuilder.getCEs(nfdString, ces, 0);
122             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
123                 throw new IllegalArgumentException(
124                         "reset position maps to too many collation elements (more than 31)");
125             }
126         }
127         if(strength == Collator.IDENTICAL) { return; }  // simple reset-at-position
128 
129         // &[before strength]position
130         assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY);
131         int index = findOrInsertNodeForCEs(strength);
132 
133         long node = nodes.elementAti(index);
134         // If the index is for a "weaker" node,
135         // then skip backwards over this and further "weaker" nodes.
136         while(strengthFromNode(node) > strength) {
137             index = previousIndexFromNode(node);
138             node = nodes.elementAti(index);
139         }
140 
141         // Find or insert a node whose index we will put into a temporary CE.
142         if(strengthFromNode(node) == strength && isTailoredNode(node)) {
143             // Reset to just before this same-strength tailored node.
144             index = previousIndexFromNode(node);
145         } else if(strength == Collator.PRIMARY) {
146             // root primary node (has no previous index)
147             long p = weight32FromNode(node);
148             if(p == 0) {
149                 throw new UnsupportedOperationException(
150                         "reset primary-before ignorable not possible");
151             }
152             if(p <= rootElements.getFirstPrimary()) {
153                 // There is no primary gap between ignorables and the space-first-primary.
154                 throw new UnsupportedOperationException(
155                         "reset primary-before first non-ignorable not supported");
156             }
157             if(p == Collation.FIRST_TRAILING_PRIMARY) {
158                 // We do not support tailoring to an unassigned-implicit CE.
159                 throw new UnsupportedOperationException(
160                         "reset primary-before [first trailing] not supported");
161             }
162             p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p));
163             index = findOrInsertNodeForPrimary(p);
164             // Go to the last node in this list:
165             // Tailor after the last node between adjacent root nodes.
166             for(;;) {
167                 node = nodes.elementAti(index);
168                 int nextIndex = nextIndexFromNode(node);
169                 if(nextIndex == 0) { break; }
170                 index = nextIndex;
171             }
172         } else {
173             // &[before 2] or &[before 3]
174             index = findCommonNode(index, Collator.SECONDARY);
175             if(strength >= Collator.TERTIARY) {
176                 index = findCommonNode(index, Collator.TERTIARY);
177             }
178             // findCommonNode() stayed on the stronger node or moved to
179             // an explicit common-weight node of the reset-before strength.
180             node = nodes.elementAti(index);
181             if(strengthFromNode(node) == strength) {
182                 // Found a same-strength node with an explicit weight.
183                 int weight16 = weight16FromNode(node);
184                 if(weight16 == 0) {
185                     throw new UnsupportedOperationException(
186                             (strength == Collator.SECONDARY) ?
187                                     "reset secondary-before secondary ignorable not possible" :
188                                     "reset tertiary-before completely ignorable not possible");
189                 }
190                 assert(weight16 > Collation.BEFORE_WEIGHT16);
191                 // Reset to just before this node.
192                 // Insert the preceding same-level explicit weight if it is not there already.
193                 // Which explicit weight immediately precedes this one?
194                 weight16 = getWeight16Before(index, node, strength);
195                 // Does this preceding weight have a node?
196                 int previousWeight16;
197                 int previousIndex = previousIndexFromNode(node);
198                 for(int i = previousIndex;; i = previousIndexFromNode(node)) {
199                     node = nodes.elementAti(i);
200                     int previousStrength = strengthFromNode(node);
201                     if(previousStrength < strength) {
202                         assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex);
203                         // Either the reset element has an above-common weight and
204                         // the parent node provides the implied common weight,
205                         // or the reset element has a weight<=common in the node
206                         // right after the parent, and we need to insert the preceding weight.
207                         previousWeight16 = Collation.COMMON_WEIGHT16;
208                         break;
209                     } else if(previousStrength == strength && !isTailoredNode(node)) {
210                         previousWeight16 = weight16FromNode(node);
211                         break;
212                     }
213                     // Skip weaker nodes and same-level tailored nodes.
214                 }
215                 if(previousWeight16 == weight16) {
216                     // The preceding weight has a node,
217                     // maybe with following weaker or tailored nodes.
218                     // Reset to the last of them.
219                     index = previousIndex;
220                 } else {
221                     // Insert a node with the preceding weight, reset to that.
222                     node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
223                     index = insertNodeBetween(previousIndex, index, node);
224                 }
225             } else {
226                 // Found a stronger node with implied strength-common weight.
227                 int weight16 = getWeight16Before(index, node, strength);
228                 index = findOrInsertWeakNode(index, weight16, strength);
229             }
230             // Strength of the temporary CE = strength of its reset position.
231             // Code above raises an error if the before-strength is stronger.
232             strength = ceStrength(ces[cesLength - 1]);
233         }
234         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
235     }
236 
237     /**
238      * Returns the secondary or tertiary weight preceding the current node's weight.
239      * node=nodes[index].
240      */
getWeight16Before(int index, long node, int level)241     private int getWeight16Before(int index, long node, int level) {
242         assert(strengthFromNode(node) < level || !isTailoredNode(node));
243         // Collect the root CE weights if this node is for a root CE.
244         // If it is not, then return the low non-primary boundary for a tailored CE.
245         int t;
246         if(strengthFromNode(node) == Collator.TERTIARY) {
247             t = weight16FromNode(node);
248         } else {
249             t = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
250         }
251         while(strengthFromNode(node) > Collator.SECONDARY) {
252             index = previousIndexFromNode(node);
253             node = nodes.elementAti(index);
254         }
255         if(isTailoredNode(node)) {
256             return Collation.BEFORE_WEIGHT16;
257         }
258         int s;
259         if(strengthFromNode(node) == Collator.SECONDARY) {
260             s = weight16FromNode(node);
261         } else {
262             s = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
263         }
264         while(strengthFromNode(node) > Collator.PRIMARY) {
265             index = previousIndexFromNode(node);
266             node = nodes.elementAti(index);
267         }
268         if(isTailoredNode(node)) {
269             return Collation.BEFORE_WEIGHT16;
270         }
271         // [p, s, t] is a root CE. Return the preceding weight for the requested level.
272         long p = weight32FromNode(node);
273         int weight16;
274         if(level == Collator.SECONDARY) {
275             weight16 = rootElements.getSecondaryBefore(p, s);
276         } else {
277             weight16 = rootElements.getTertiaryBefore(p, s, t);
278             assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
279         }
280         return weight16;
281     }
282 
getSpecialResetPosition(CharSequence str)283     private long getSpecialResetPosition(CharSequence str) {
284         assert(str.length() == 2);
285         long ce;
286         int strength = Collator.PRIMARY;
287         boolean isBoundary = false;
288         CollationRuleParser.Position pos =
289                 CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE];
290         switch(pos) {
291         case FIRST_TERTIARY_IGNORABLE:
292             // Quaternary CEs are not supported.
293             // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
294             return 0;
295         case LAST_TERTIARY_IGNORABLE:
296             return 0;
297         case FIRST_SECONDARY_IGNORABLE: {
298             // Look for a tailored tertiary node after [0, 0, 0].
299             int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY);
300             long node = nodes.elementAti(index);
301             if((index = nextIndexFromNode(node)) != 0) {
302                 node = nodes.elementAti(index);
303                 assert(strengthFromNode(node) <= Collator.TERTIARY);
304                 if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) {
305                     return tempCEFromIndexAndStrength(index, Collator.TERTIARY);
306                 }
307             }
308             return rootElements.getFirstTertiaryCE();
309             // No need to look for nodeHasAnyBefore() on a tertiary node.
310         }
311         case LAST_SECONDARY_IGNORABLE:
312             ce = rootElements.getLastTertiaryCE();
313             strength = Collator.TERTIARY;
314             break;
315         case FIRST_PRIMARY_IGNORABLE: {
316             // Look for a tailored secondary node after [0, 0, *].
317             int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY);
318             long node = nodes.elementAti(index);
319             while((index = nextIndexFromNode(node)) != 0) {
320                 node = nodes.elementAti(index);
321                 strength = strengthFromNode(node);
322                 if(strength < Collator.SECONDARY) { break; }
323                 if(strength == Collator.SECONDARY) {
324                     if(isTailoredNode(node)) {
325                         if(nodeHasBefore3(node)) {
326                             index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
327                             assert(isTailoredNode(nodes.elementAti(index)));
328                         }
329                         return tempCEFromIndexAndStrength(index, Collator.SECONDARY);
330                     } else {
331                         break;
332                     }
333                 }
334             }
335             ce = rootElements.getFirstSecondaryCE();
336             strength = Collator.SECONDARY;
337             break;
338         }
339         case LAST_PRIMARY_IGNORABLE:
340             ce = rootElements.getLastSecondaryCE();
341             strength = Collator.SECONDARY;
342             break;
343         case FIRST_VARIABLE:
344             ce = rootElements.getFirstPrimaryCE();
345             isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
346             break;
347         case LAST_VARIABLE:
348             ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
349             break;
350         case FIRST_REGULAR:
351             ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
352             isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
353             break;
354         case LAST_REGULAR:
355             // Use the Hani-first-primary rather than the actual last "regular" CE before it,
356             // for backward compatibility with behavior before the introduction of
357             // script-first-primary CEs in the root collator.
358             ce = rootElements.firstCEWithPrimaryAtLeast(
359                 baseData.getFirstPrimaryForGroup(UScript.HAN));
360             break;
361         case FIRST_IMPLICIT:
362             ce = baseData.getSingleCE(0x4e00);
363             break;
364         case LAST_IMPLICIT:
365             // We do not support tailoring to an unassigned-implicit CE.
366             throw new UnsupportedOperationException(
367                     "reset to [last implicit] not supported");
368         case FIRST_TRAILING:
369             ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY);
370             isBoundary = true;  // trailing first primary (there is no mapping for it)
371             break;
372         case LAST_TRAILING:
373             throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF");
374         default:
375             assert(false);
376             return 0;
377         }
378 
379         int index = findOrInsertNodeForRootCE(ce, strength);
380         long node = nodes.elementAti(index);
381         if((pos.ordinal() & 1) == 0) {
382             // even pos = [first xyz]
383             if(!nodeHasAnyBefore(node) && isBoundary) {
384                 // A <group> first primary boundary is artificially added to FractionalUCA.txt.
385                 // It is reachable via its special contraction, but is not normally used.
386                 // Find the first character tailored after the boundary CE,
387                 // or the first real root CE after it.
388                 if((index = nextIndexFromNode(node)) != 0) {
389                     // If there is a following node, then it must be tailored
390                     // because there are no root CEs with a boundary primary
391                     // and non-common secondary/tertiary weights.
392                     node = nodes.elementAti(index);
393                     assert(isTailoredNode(node));
394                     ce = tempCEFromIndexAndStrength(index, strength);
395                 } else {
396                     assert(strength == Collator.PRIMARY);
397                     long p = ce >>> 32;
398                     int pIndex = rootElements.findPrimary(p);
399                     boolean isCompressible = baseData.isCompressiblePrimary(p);
400                     p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
401                     ce = Collation.makeCE(p);
402                     index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY);
403                     node = nodes.elementAti(index);
404                 }
405             }
406             if(nodeHasAnyBefore(node)) {
407                 // Get the first node that was tailored before this one at a weaker strength.
408                 if(nodeHasBefore2(node)) {
409                     index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
410                     node = nodes.elementAti(index);
411                 }
412                 if(nodeHasBefore3(node)) {
413                     index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
414                 }
415                 assert(isTailoredNode(nodes.elementAti(index)));
416                 ce = tempCEFromIndexAndStrength(index, strength);
417             }
418         } else {
419             // odd pos = [last xyz]
420             // Find the last node that was tailored after the [last xyz]
421             // at a strength no greater than the position's strength.
422             for(;;) {
423                 int nextIndex = nextIndexFromNode(node);
424                 if(nextIndex == 0) { break; }
425                 long nextNode = nodes.elementAti(nextIndex);
426                 if(strengthFromNode(nextNode) < strength) { break; }
427                 index = nextIndex;
428                 node = nextNode;
429             }
430             // Do not make a temporary CE for a root node.
431             // This last node might be the node for the root CE itself,
432             // or a node with a common secondary or tertiary weight.
433             if(isTailoredNode(node)) {
434                 ce = tempCEFromIndexAndStrength(index, strength);
435             }
436         }
437         return ce;
438     }
439 
440     /** Implements CollationRuleParser.Sink. */
441     @Override
addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension)442     void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) {
443         String nfdPrefix;
444         if(prefix.length() == 0) {
445             nfdPrefix = "";
446         } else {
447             nfdPrefix = nfd.normalize(prefix);
448         }
449         String nfdString = nfd.normalize(str);
450 
451         // The runtime code decomposes Hangul syllables on the fly,
452         // with recursive processing but without making the Jamo pieces visible for matching.
453         // It does not work with certain types of contextual mappings.
454         int nfdLength = nfdString.length();
455         if(nfdLength >= 2) {
456             char c = nfdString.charAt(0);
457             if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) {
458                 // While handling a Hangul syllable, contractions starting with Jamo L or V
459                 // would not see the following Jamo of that syllable.
460                 throw new UnsupportedOperationException(
461                         "contractions starting with conjoining Jamo L or V not supported");
462             }
463             c = nfdString.charAt(nfdLength - 1);
464             if(Hangul.isJamoL(c) ||
465                     (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) {
466                 // A contraction ending with Jamo L or L+V would require
467                 // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
468                 // or decomposing a following Hangul syllable on the fly, during contraction matching.
469                 throw new UnsupportedOperationException(
470                         "contractions ending with conjoining Jamo L or L+V not supported");
471             }
472             // A Hangul syllable completely inside a contraction is ok.
473         }
474         // Note: If there is a prefix, then the parser checked that
475         // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
476         // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
477         // (While handling a Hangul syllable, prefixes on Jamo V or T
478         // would not see the previous Jamo of that syllable.)
479 
480         if(strength != Collator.IDENTICAL) {
481             // Find the node index after which we insert the new tailored node.
482             int index = findOrInsertNodeForCEs(strength);
483             assert(cesLength > 0);
484             long ce = ces[cesLength - 1];
485             if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) {
486                 // There is no primary gap between ignorables and the space-first-primary.
487                 throw new UnsupportedOperationException(
488                         "tailoring primary after ignorables not supported");
489             }
490             if(strength == Collator.QUATERNARY && ce == 0) {
491                 // The CE data structure does not support non-zero quaternary weights
492                 // on tertiary ignorables.
493                 throw new UnsupportedOperationException(
494                         "tailoring quaternary after tertiary ignorables not supported");
495             }
496             // Insert the new tailored node.
497             index = insertTailoredNodeAfter(index, strength);
498             // Strength of the temporary CE:
499             // The new relation may yield a stronger CE but not a weaker one.
500             int tempStrength = ceStrength(ce);
501             if(strength < tempStrength) { tempStrength = strength; }
502             ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
503         }
504 
505         setCaseBits(nfdString);
506 
507         int cesLengthBeforeExtension = cesLength;
508         if(extension.length() != 0) {
509             String nfdExtension = nfd.normalize(extension);
510             cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength);
511             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
512                 throw new IllegalArgumentException(
513                         "extension string adds too many collation elements (more than 31 total)");
514             }
515         }
516         int ce32 = Collation.UNASSIGNED_CE32;
517         if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) &&
518                 !ignorePrefix(prefix) && !ignoreString(str)) {
519             // Map from the original input to the CEs.
520             // We do this in case the canonical closure is incomplete,
521             // so that it is possible to explicitly provide the missing mappings.
522             ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32);
523         }
524         addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32);
525         cesLength = cesLengthBeforeExtension;
526     }
527 
528     /**
529      * Picks one of the current CEs and finds or inserts a node in the graph
530      * for the CE + strength.
531      */
findOrInsertNodeForCEs(int strength)532     private int findOrInsertNodeForCEs(int strength) {
533         assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY);
534 
535         // Find the last CE that is at least as "strong" as the requested difference.
536         // Note: Stronger is smaller (Collator.PRIMARY=0).
537         long ce;
538         for(;; --cesLength) {
539             if(cesLength == 0) {
540                 ce = ces[0] = 0;
541                 cesLength = 1;
542                 break;
543             } else {
544                 ce = ces[cesLength - 1];
545             }
546             if(ceStrength(ce) <= strength) { break; }
547         }
548 
549         if(isTempCE(ce)) {
550             // No need to findCommonNode() here for lower levels
551             // because insertTailoredNodeAfter() will do that anyway.
552             return indexFromTempCE(ce);
553         }
554 
555         // root CE
556         if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) {
557             throw new UnsupportedOperationException(
558                     "tailoring relative to an unassigned code point not supported");
559         }
560         return findOrInsertNodeForRootCE(ce, strength);
561     }
562 
findOrInsertNodeForRootCE(long ce, int strength)563     private int findOrInsertNodeForRootCE(long ce, int strength) {
564         assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE);
565 
566         // Find or insert the node for each of the root CE's weights,
567         // down to the requested level/strength.
568         // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
569         assert((ce & 0xc0) == 0);
570         int index = findOrInsertNodeForPrimary(ce >>> 32);
571         if(strength >= Collator.SECONDARY) {
572             int lower32 = (int)ce;
573             index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
574             if(strength >= Collator.TERTIARY) {
575                 index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK,
576                                             Collator.TERTIARY);
577             }
578         }
579         return index;
580     }
581 
582     /**
583      * Like Java Collections.binarySearch(List, key, Comparator).
584      *
585      * @return the index>=0 where the item was found,
586      *         or the index<0 for inserting the string at ~index in sorted order
587      *         (index into rootPrimaryIndexes)
588      */
binarySearchForRootPrimaryNode( int[] rootPrimaryIndexes, int length, long[] nodes, long p)589     private static final int binarySearchForRootPrimaryNode(
590             int[] rootPrimaryIndexes, int length, long[] nodes, long p) {
591         if(length == 0) { return ~0; }
592         int start = 0;
593         int limit = length;
594         for (;;) {
595             int i = (int)(((long)start + (long)limit) / 2);
596             long node = nodes[rootPrimaryIndexes[i]];
597             long nodePrimary = node >>> 32;  // weight32FromNode(node)
598             if (p == nodePrimary) {
599                 return i;
600             } else if (p < nodePrimary) {
601                 if (i == start) {
602                     return ~start;  // insert s before i
603                 }
604                 limit = i;
605             } else {
606                 if (i == start) {
607                     return ~(start + 1);  // insert s after i
608                 }
609                 start = i;
610             }
611         }
612     }
613 
614     /** Finds or inserts the node for a root CE's primary weight. */
findOrInsertNodeForPrimary(long p)615     private int findOrInsertNodeForPrimary(long p) {
616         int rootIndex = binarySearchForRootPrimaryNode(
617             rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
618         if(rootIndex >= 0) {
619             return rootPrimaryIndexes.elementAti(rootIndex);
620         } else {
621             // Start a new list of nodes with this primary.
622             int index = nodes.size();
623             nodes.addElement(nodeFromWeight32(p));
624             rootPrimaryIndexes.insertElementAt(index, ~rootIndex);
625             return index;
626         }
627     }
628 
629     /** Finds or inserts the node for a secondary or tertiary weight. */
findOrInsertWeakNode(int index, int weight16, int level)630     private int findOrInsertWeakNode(int index, int weight16, int level) {
631         assert(0 <= index && index < nodes.size());
632         assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY);
633 
634         if(weight16 == Collation.COMMON_WEIGHT16) {
635             return findCommonNode(index, level);
636         }
637 
638         // If this will be the first below-common weight for the parent node,
639         // then we will also need to insert a common weight after it.
640         long node = nodes.elementAti(index);
641         assert(strengthFromNode(node) < level);  // parent node is stronger
642         if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) {
643             int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
644             if((node & hasThisLevelBefore) == 0) {
645                 // The parent node has an implied level-common weight.
646                 long commonNode =
647                     nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level);
648                 if(level == Collator.SECONDARY) {
649                     // Move the HAS_BEFORE3 flag from the parent node
650                     // to the new secondary common node.
651                     commonNode |= node & HAS_BEFORE3;
652                     node &= ~(long)HAS_BEFORE3;
653                 }
654                 nodes.setElementAt(node | hasThisLevelBefore, index);
655                 // Insert below-common-weight node.
656                 int nextIndex = nextIndexFromNode(node);
657                 node = nodeFromWeight16(weight16) | nodeFromStrength(level);
658                 index = insertNodeBetween(index, nextIndex, node);
659                 // Insert common-weight node.
660                 insertNodeBetween(index, nextIndex, commonNode);
661                 // Return index of below-common-weight node.
662                 return index;
663             }
664         }
665 
666         // Find the root CE's weight for this level.
667         // Postpone insertion if not found:
668         // Insert the new root node before the next stronger node,
669         // or before the next root node with the same strength and a larger weight.
670         int nextIndex;
671         while((nextIndex = nextIndexFromNode(node)) != 0) {
672             node = nodes.elementAti(nextIndex);
673             int nextStrength = strengthFromNode(node);
674             if(nextStrength <= level) {
675                 // Insert before a stronger node.
676                 if(nextStrength < level) { break; }
677                 // nextStrength == level
678                 if(!isTailoredNode(node)) {
679                     int nextWeight16 = weight16FromNode(node);
680                     if(nextWeight16 == weight16) {
681                         // Found the node for the root CE up to this level.
682                         return nextIndex;
683                     }
684                     // Insert before a node with a larger same-strength weight.
685                     if(nextWeight16 > weight16) { break; }
686                 }
687             }
688             // Skip the next node.
689             index = nextIndex;
690         }
691         node = nodeFromWeight16(weight16) | nodeFromStrength(level);
692         return insertNodeBetween(index, nextIndex, node);
693     }
694 
695     /**
696      * Makes and inserts a new tailored node into the list, after the one at index.
697      * Skips over nodes of weaker strength to maintain collation order
698      * ("postpone insertion").
699      * @return the new node's index
700      */
701     private int insertTailoredNodeAfter(int index, int strength) {
702         assert(0 <= index && index < nodes.size());
703         if(strength >= Collator.SECONDARY) {
704             index = findCommonNode(index, Collator.SECONDARY);
705             if(strength >= Collator.TERTIARY) {
706                 index = findCommonNode(index, Collator.TERTIARY);
707             }
708         }
709         // Postpone insertion:
710         // Insert the new node before the next one with a strength at least as strong.
711         long node = nodes.elementAti(index);
712         int nextIndex;
713         while((nextIndex = nextIndexFromNode(node)) != 0) {
714             node = nodes.elementAti(nextIndex);
715             if(strengthFromNode(node) <= strength) { break; }
716             // Skip the next node which has a weaker (larger) strength than the new one.
717             index = nextIndex;
718         }
719         node = IS_TAILORED | nodeFromStrength(strength);
720         return insertNodeBetween(index, nextIndex, node);
721     }
722 
723     /**
724      * Inserts a new node into the list, between list-adjacent items.
725      * The node's previous and next indexes must not be set yet.
726      * @return the new node's index
727      */
728     private int insertNodeBetween(int index, int nextIndex, long node) {
729         assert(previousIndexFromNode(node) == 0);
730         assert(nextIndexFromNode(node) == 0);
731         assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
732         // Append the new node and link it to the existing nodes.
733         int newIndex = nodes.size();
734         node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
735         nodes.addElement(node);
736         // nodes[index].nextIndex = newIndex
737         node = nodes.elementAti(index);
738         nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
739         // nodes[nextIndex].previousIndex = newIndex
740         if(nextIndex != 0) {
741             node = nodes.elementAti(nextIndex);
742             nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
743         }
744         return newIndex;
745     }
746 
747     /**
748      * Finds the node which implies or contains a common=05 weight of the given strength
749      * (secondary or tertiary), if the current node is stronger.
750      * Skips weaker nodes and tailored nodes if the current node is stronger
751      * and is followed by an explicit-common-weight node.
752      * Always returns the input index if that node is no stronger than the given strength.
753      */
754     private int findCommonNode(int index, int strength) {
755         assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY);
756         long node = nodes.elementAti(index);
757         if(strengthFromNode(node) >= strength) {
758             // The current node is no stronger.
759             return index;
760         }
761         if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
762             // The current node implies the strength-common weight.
763             return index;
764         }
765         index = nextIndexFromNode(node);
766         node = nodes.elementAti(index);
767         assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
768                 weight16FromNode(node) < Collation.COMMON_WEIGHT16);
769         // Skip to the explicit common node.
770         do {
771             index = nextIndexFromNode(node);
772             node = nodes.elementAti(index);
773             assert(strengthFromNode(node) >= strength);
774         } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
775                 weight16FromNode(node) < Collation.COMMON_WEIGHT16);
776         assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
777         return index;
778     }
779 
780     private void setCaseBits(CharSequence nfdString) {
781         int numTailoredPrimaries = 0;
782         for(int i = 0; i < cesLength; ++i) {
783             if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; }
784         }
785         // We should not be able to get too many case bits because
786         // cesLength<=31==MAX_EXPANSION_LENGTH.
787         // 31 pairs of case bits fit into an long without setting its sign bit.
788         assert(numTailoredPrimaries <= 31);
789 
790         long cases = 0;
791         if(numTailoredPrimaries > 0) {
792             CharSequence s = nfdString;
793             UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0);
794             int baseCEsLength = baseCEs.fetchCEs() - 1;
795             assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE);
796 
797             int lastCase = 0;
798             int numBasePrimaries = 0;
799             for(int i = 0; i < baseCEsLength; ++i) {
800                 long ce = baseCEs.getCE(i);
801                 if((ce >>> 32) != 0) {
802                     ++numBasePrimaries;
803                     int c = ((int)ce >> 14) & 3;
804                     assert(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
805                     if(numBasePrimaries < numTailoredPrimaries) {
806                         cases |= (long)c << ((numBasePrimaries - 1) * 2);
807                     } else if(numBasePrimaries == numTailoredPrimaries) {
808                         lastCase = c;
809                     } else if(c != lastCase) {
810                         // There are more base primary CEs than tailored primaries.
811                         // Set mixed case if the case bits of the remainder differ.
812                         lastCase = 1;
813                         // Nothing more can change.
814                         break;
815                     }
816                 }
817             }
818             if(numBasePrimaries >= numTailoredPrimaries) {
819                 cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2);
820             }
821         }
822 
823         for(int i = 0; i < cesLength; ++i) {
824             long ce = ces[i] & 0xffffffffffff3fffL;  // clear old case bits
825             int strength = ceStrength(ce);
826             if(strength == Collator.PRIMARY) {
827                 ce |= (cases & 3) << 14;
828                 cases >>>= 2;
829             } else if(strength == Collator.TERTIARY) {
830                 // Tertiary CEs must have uppercase bits.
831                 // See the LDML spec, and comments in class CollationCompare.
832                 ce |= 0x8000;
833             }
834             // Tertiary ignorable CEs must have 0 case bits.
835             // We set 0 case bits for secondary CEs too
836             // since currently only U+0345 is cased and maps to a secondary CE,
837             // and it is lowercase. Other secondaries are uncased.
838             // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
839             ces[i] = ce;
840         }
841     }
842 
843     /** Implements CollationRuleParser.Sink. */
844     @Override
845     void suppressContractions(UnicodeSet set) {
846         dataBuilder.suppressContractions(set);
847     }
848 
849     /** Implements CollationRuleParser.Sink. */
850     @Override
851     void optimize(UnicodeSet set) {
852         optimizeSet.addAll(set);
853     }
854 
855     /**
856      * Adds the mapping and its canonical closure.
857      * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder
858      * need not re-encode the CEs multiple times.
859      */
860     private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString,
861                 long[] newCEs, int newCEsLength, int ce32) {
862         // Map from the NFD input to the CEs.
863         ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
864         ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
865         addTailComposites(nfdPrefix, nfdString);
866         return ce32;
867     }
868 
869     private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
870                 long[] newCEs, int newCEsLength, int ce32) {
871         // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
872         // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
873         if(nfdPrefix.length() == 0) {
874             CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
875             String prefix = "";
876             for(;;) {
877                 String str = stringIter.next();
878                 if(str == null) { break; }
879                 if(ignoreString(str) || str.contentEquals(nfdString)) { continue; }
880                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
881             }
882         } else {
883             CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString());
884             CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
885             for(;;) {
886                 String prefix = prefixIter.next();
887                 if(prefix == null) { break; }
888                 if(ignorePrefix(prefix)) { continue; }
889                 boolean samePrefix = prefix.contentEquals(nfdPrefix);
890                 for(;;) {
891                     String str = stringIter.next();
892                     if(str == null) { break; }
893                     if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; }
894                     ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
895                 }
896                 stringIter.reset();
897             }
898         }
899         return ce32;
900     }
901 
902     private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
903         // Look for the last starter in the NFD string.
904         int lastStarter;
905         int indexAfterLastStarter = nfdString.length();
906         for(;;) {
907             if(indexAfterLastStarter == 0) { return; }  // no starter at all
908             lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
909             if(nfd.getCombiningClass(lastStarter) == 0) { break; }
910             indexAfterLastStarter -= Character.charCount(lastStarter);
911         }
912         // No closure to Hangul syllables since we decompose them on the fly.
913         if(Hangul.isJamoL(lastStarter)) { return; }
914 
915         // Are there any composites whose decomposition starts with the lastStarter?
916         // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
917         // We might find some more equivalent mappings here if it did.
918         UnicodeSet composites = new UnicodeSet();
919         if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
920 
921         StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
922         long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
923         UnicodeSetIterator iter = new UnicodeSetIterator(composites);
924         while(iter.next()) {
925             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
926             int composite = iter.codepoint;
927             String decomp = nfd.getDecomposition(composite);
928             if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
929                     newNFDString, newString)) {
930                 continue;
931             }
932             int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
933             if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
934                 // Ignore mappings that we cannot store.
935                 continue;
936             }
937             // Note: It is possible that the newCEs do not make use of the mapping
938             // for which we are adding the tail composites, in which case we might be adding
939             // unnecessary mappings.
940             // For example, when we add tail composites for ae^ (^=combining circumflex),
941             // UCA discontiguous-contraction matching does not find any matches
942             // for ae_^ (_=any combining diacritic below) *unless* there is also
943             // a contraction mapping for ae.
944             // Thus, if there is no ae contraction, then the ae^ mapping is ignored
945             // while fetching the newCEs for ae_^.
946             // TODO: Try to detect this effectively.
947             // (Alternatively, print a warning when prefix contractions are missing.)
948 
949             // We do not need an explicit mapping for the NFD strings.
950             // It is fine if the NFD input collates like this via a sequence of mappings.
951             // It also saves a little bit of space, and may reduce the set of characters with contractions.
952             int ce32 = addIfDifferent(nfdPrefix, newString,
953                                           newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
954             if(ce32 != Collation.UNASSIGNED_CE32) {
955                 // was different, was added
956                 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
957             }
958         }
959     }
960 
961     private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter,
962                 int composite, CharSequence decomp,
963                 StringBuilder newNFDString, StringBuilder newString) {
964         assert(Character.codePointBefore(nfdString, indexAfterLastStarter) ==
965                 Character.codePointAt(decomp, 0));
966         int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1);
967         if(lastStarterLength == decomp.length()) {
968             // Singleton decompositions should be found by addWithClosure()
969             // and the CanonicalIterator, so we can ignore them here.
970             return false;
971         }
972         if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) {
973             // same strings, nothing new to be found here
974             return false;
975         }
976 
977         // Make new FCD strings that combine a composite, or its decomposition,
978         // into the nfdString's last starter and the combining marks following it.
979         // Make an NFD version, and a version with the composite.
980         newNFDString.setLength(0);
981         newNFDString.append(nfdString, 0, indexAfterLastStarter);
982         newString.setLength(0);
983         newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength)
984             .appendCodePoint(composite);
985 
986         // The following is related to discontiguous contraction matching,
987         // but builds only FCD strings (or else returns false).
988         int sourceIndex = indexAfterLastStarter;
989         int decompIndex = lastStarterLength;
990         // Small optimization: We keep the source character across loop iterations
991         // because we do not always consume it,
992         // and then need not fetch it again nor look up its combining class again.
993         int sourceChar = Collation.SENTINEL_CP;
994         // The cc variables need to be declared before the loop so that at the end
995         // they are set to the last combining classes seen.
996         int sourceCC = 0;
997         int decompCC = 0;
998         for(;;) {
999             if(sourceChar < 0) {
1000                 if(sourceIndex >= nfdString.length()) { break; }
1001                 sourceChar = Character.codePointAt(nfdString, sourceIndex);
1002                 sourceCC = nfd.getCombiningClass(sourceChar);
1003                 assert(sourceCC != 0);
1004             }
1005             // We consume a decomposition character in each iteration.
1006             if(decompIndex >= decomp.length()) { break; }
1007             int decompChar = Character.codePointAt(decomp, decompIndex);
1008             decompCC = nfd.getCombiningClass(decompChar);
1009             // Compare the two characters and their combining classes.
1010             if(decompCC == 0) {
1011                 // Unable to merge because the source contains a non-zero combining mark
1012                 // but the composite's decomposition contains another starter.
1013                 // The strings would not be equivalent.
1014                 return false;
1015             } else if(sourceCC < decompCC) {
1016                 // Composite + sourceChar would not be FCD.
1017                 return false;
1018             } else if(decompCC < sourceCC) {
1019                 newNFDString.appendCodePoint(decompChar);
1020                 decompIndex += Character.charCount(decompChar);
1021             } else if(decompChar != sourceChar) {
1022                 // Blocked because same combining class.
1023                 return false;
1024             } else {  // match: decompChar == sourceChar
1025                 newNFDString.appendCodePoint(decompChar);
1026                 decompIndex += Character.charCount(decompChar);
1027                 sourceIndex += Character.charCount(decompChar);
1028                 sourceChar = Collation.SENTINEL_CP;
1029             }
1030         }
1031         // We are at the end of at least one of the two inputs.
1032         if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1033             if(sourceCC < decompCC) {
1034                 // Appending the next source character to the composite would not be FCD.
1035                 return false;
1036             }
1037             newNFDString.append(nfdString, sourceIndex, nfdString.length());
1038             newString.append(nfdString, sourceIndex, nfdString.length());
1039         } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1040             newNFDString.append(decomp, decompIndex, decomp.length());
1041         }
1042         assert(nfd.isNormalized(newNFDString));
1043         assert(fcd.isNormalized(newString));
1044         assert(nfd.normalize(newString).equals(newNFDString.toString()));  // canonically equivalent
1045         return true;
1046     }
1047 
1048     private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) {
1049         // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0
1050         int leftLength = left.length();
1051         if((leftLength - leftStart) != (right.length() - rightStart)) { return false; }
1052         while(leftStart < leftLength) {
1053             if(left.charAt(leftStart++) != right.charAt(rightStart++)) {
1054                 return false;
1055             }
1056         }
1057         return true;
1058     }
1059     private boolean ignorePrefix(CharSequence s) {
1060         // Do not map non-FCD prefixes.
1061         return !isFCD(s);
1062     }
1063     private boolean ignoreString(CharSequence s) {
1064         // Do not map non-FCD strings.
1065         // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1066         return !isFCD(s) || Hangul.isHangul(s.charAt(0));
1067     }
1068     private boolean isFCD(CharSequence s) {
1069         return fcd.isNormalized(s);
1070     }
1071 
1072     private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]");
1073     static {
1074         // Hangul is decomposed on the fly during collation.
1075         COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
1076     }
1077 
1078     private void closeOverComposites() {
1079         String prefix = "";  // empty
1080         UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES);
1081         while(iter.next()) {
1082             assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1083             String nfdString = nfd.getDecomposition(iter.codepoint);
1084             cesLength = dataBuilder.getCEs(nfdString, ces, 0);
1085             if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
1086                 // Too many CEs from the decomposition (unusual), ignore this composite.
1087                 // We could add a capacity parameter to getCEs() and reallocate if necessary.
1088                 // However, this can only really happen in contrived cases.
1089                 continue;
1090             }
1091             String composite = iter.getString();
1092             addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32);
1093         }
1094     }
1095 
1096     private int addIfDifferent(CharSequence prefix, CharSequence str,
1097                 long[] newCEs, int newCEsLength, int ce32) {
1098         long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH];
1099         int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0);
1100         if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1101             if(ce32 == Collation.UNASSIGNED_CE32) {
1102                 ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength);
1103             }
1104             dataBuilder.addCE32(prefix, str, ce32);
1105         }
1106         return ce32;
1107     }
1108 
1109     private static boolean sameCEs(long[] ces1, int ces1Length,
1110                 long[] ces2, int ces2Length) {
1111         if(ces1Length != ces2Length) {
1112             return false;
1113         }
1114         assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH);
1115         for(int i = 0; i < ces1Length; ++i) {
1116             if(ces1[i] != ces2[i]) { return false; }
1117         }
1118         return true;
1119     }
1120 
1121     private static final int alignWeightRight(int w) {
1122         if(w != 0) {
1123             while((w & 0xff) == 0) { w >>>= 8; }
1124         }
1125         return w;
1126     }
1127 
1128     /**
1129      * Walks the tailoring graph and overwrites tailored nodes with new CEs.
1130      * After this, the graph is destroyed.
1131      * The nodes array can then be used only as a source of tailored CEs.
1132      */
1133     private void makeTailoredCEs() {
1134         CollationWeights primaries = new CollationWeights();
1135         CollationWeights secondaries = new CollationWeights();
1136         CollationWeights tertiaries = new CollationWeights();
1137         long[] nodesArray = nodes.getBuffer();
1138         if(DEBUG) {
1139             System.out.println("\nCollationBuilder.makeTailoredCEs()");
1140         }
1141 
1142         for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1143             int i = rootPrimaryIndexes.elementAti(rpi);
1144             long node = nodesArray[i];
1145             long p = weight32FromNode(node);
1146             int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16;
1147             int t = s;
1148             int q = 0;
1149             boolean pIsTailored = false;
1150             boolean sIsTailored = false;
1151             boolean tIsTailored = false;
1152             if(DEBUG) {
1153                 System.out.printf("\nprimary     %x\n", alignWeightRight((int)p));
1154             }
1155             int pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1156             int nextIndex = nextIndexFromNode(node);
1157             while(nextIndex != 0) {
1158                 i = nextIndex;
1159                 node = nodesArray[i];
1160                 nextIndex = nextIndexFromNode(node);
1161                 int strength = strengthFromNode(node);
1162                 if(strength == Collator.QUATERNARY) {
1163                     assert(isTailoredNode(node));
1164                     if(DEBUG) {
1165                         System.out.print("      quat+     ");
1166                     }
1167                     if(q == 3) {
1168                         // C++ U_BUFFER_OVERFLOW_ERROR
1169                         throw new UnsupportedOperationException("quaternary tailoring gap too small");
1170                     }
1171                     ++q;
1172                 } else {
1173                     if(strength == Collator.TERTIARY) {
1174                         if(isTailoredNode(node)) {
1175                             if(DEBUG) {
1176                                 System.out.print("    ter+        ");
1177                             }
1178                             if(!tIsTailored) {
1179                                 // First tailored tertiary node for [p, s].
1180                                 int tCount = countTailoredNodes(nodesArray, nextIndex,
1181                                                                     Collator.TERTIARY) + 1;
1182                                 int tLimit;
1183                                 if(t == 0) {
1184                                     // Gap at the beginning of the tertiary CE range.
1185                                     t = rootElements.getTertiaryBoundary() - 0x100;
1186                                     tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
1187                                 } else if(!pIsTailored && !sIsTailored) {
1188                                     // p and s are root weights.
1189                                     tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
1190                                 } else if(t == Collation.BEFORE_WEIGHT16) {
1191                                     tLimit = Collation.COMMON_WEIGHT16;
1192                                 } else {
1193                                     // [p, s] is tailored.
1194                                     assert(t == Collation.COMMON_WEIGHT16);
1195                                     tLimit = rootElements.getTertiaryBoundary();
1196                                 }
1197                                 assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0);
1198                                 tertiaries.initForTertiary();
1199                                 if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1200                                     // C++ U_BUFFER_OVERFLOW_ERROR
1201                                     throw new UnsupportedOperationException("tertiary tailoring gap too small");
1202                                 }
1203                                 tIsTailored = true;
1204                             }
1205                             t = (int)tertiaries.nextWeight();
1206                             assert(t != 0xffffffff);
1207                         } else {
1208                             t = weight16FromNode(node);
1209                             tIsTailored = false;
1210                             if(DEBUG) {
1211                                 System.out.printf("    ter     %x\n", alignWeightRight(t));
1212                             }
1213                         }
1214                     } else {
1215                         if(strength == Collator.SECONDARY) {
1216                             if(isTailoredNode(node)) {
1217                                 if(DEBUG) {
1218                                     System.out.print("  sec+          ");
1219                                 }
1220                                 if(!sIsTailored) {
1221                                     // First tailored secondary node for p.
1222                                     int sCount = countTailoredNodes(nodesArray, nextIndex,
1223                                                                         Collator.SECONDARY) + 1;
1224                                     int sLimit;
1225                                     if(s == 0) {
1226                                         // Gap at the beginning of the secondary CE range.
1227                                         s = rootElements.getSecondaryBoundary() - 0x100;
1228                                         sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
1229                                     } else if(!pIsTailored) {
1230                                         // p is a root primary.
1231                                         sLimit = rootElements.getSecondaryAfter(pIndex, s);
1232                                     } else if(s == Collation.BEFORE_WEIGHT16) {
1233                                         sLimit = Collation.COMMON_WEIGHT16;
1234                                     } else {
1235                                         // p is a tailored primary.
1236                                         assert(s == Collation.COMMON_WEIGHT16);
1237                                         sLimit = rootElements.getSecondaryBoundary();
1238                                     }
1239                                     if(s == Collation.COMMON_WEIGHT16) {
1240                                         // Do not tailor into the getSortKey() range of
1241                                         // compressed common secondaries.
1242                                         s = rootElements.getLastCommonSecondary();
1243                                     }
1244                                     secondaries.initForSecondary();
1245                                     if(!secondaries.allocWeights(s, sLimit, sCount)) {
1246                                         // C++ U_BUFFER_OVERFLOW_ERROR
1247                                         if(DEBUG) {
1248                                             System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n",
1249                                                     alignWeightRight(s), alignWeightRight(sLimit),
1250                                                     alignWeightRight(sCount));
1251                                         }
1252                                         throw new UnsupportedOperationException("secondary tailoring gap too small");
1253                                     }
1254                                     sIsTailored = true;
1255                                 }
1256                                 s = (int)secondaries.nextWeight();
1257                                 assert(s != 0xffffffff);
1258                             } else {
1259                                 s = weight16FromNode(node);
1260                                 sIsTailored = false;
1261                                 if(DEBUG) {
1262                                     System.out.printf("  sec       %x\n", alignWeightRight(s));
1263                                 }
1264                             }
1265                         } else /* Collator.PRIMARY */ {
1266                             assert(isTailoredNode(node));
1267                             if(DEBUG) {
1268                                 System.out.print("pri+            ");
1269                             }
1270                             if(!pIsTailored) {
1271                                 // First tailored primary node in this list.
1272                                 int pCount = countTailoredNodes(nodesArray, nextIndex,
1273                                                                     Collator.PRIMARY) + 1;
1274                                 boolean isCompressible = baseData.isCompressiblePrimary(p);
1275                                 long pLimit =
1276                                     rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1277                                 primaries.initForPrimary(isCompressible);
1278                                 if(!primaries.allocWeights(p, pLimit, pCount)) {
1279                                     // C++ U_BUFFER_OVERFLOW_ERROR  // TODO: introduce a more specific UErrorCode?
1280                                     throw new UnsupportedOperationException("primary tailoring gap too small");
1281                                 }
1282                                 pIsTailored = true;
1283                             }
1284                             p = primaries.nextWeight();
1285                             assert(p != 0xffffffffL);
1286                             s = Collation.COMMON_WEIGHT16;
1287                             sIsTailored = false;
1288                         }
1289                         t = s == 0 ? 0 : Collation.COMMON_WEIGHT16;
1290                         tIsTailored = false;
1291                     }
1292                     q = 0;
1293                 }
1294                 if(isTailoredNode(node)) {
1295                     nodesArray[i] = Collation.makeCE(p, s, t, q);
1296                     if(DEBUG) {
1297                         System.out.printf("%016x\n", nodesArray[i]);
1298                     }
1299                 }
1300             }
1301         }
1302     }
1303 
1304     /**
1305      * Counts the tailored nodes of the given strength up to the next node
1306      * which is either stronger or has an explicit weight of this strength.
1307      */
1308     private static int countTailoredNodes(long[] nodesArray, int i, int strength) {
1309         int count = 0;
1310         for(;;) {
1311             if(i == 0) { break; }
1312             long node = nodesArray[i];
1313             if(strengthFromNode(node) < strength) { break; }
1314             if(strengthFromNode(node) == strength) {
1315                 if(isTailoredNode(node)) {
1316                     ++count;
1317                 } else {
1318                     break;
1319                 }
1320             }
1321             i = nextIndexFromNode(node);
1322         }
1323         return count;
1324     }
1325 
1326     private static final class CEFinalizer implements CollationDataBuilder.CEModifier {
1327         CEFinalizer(long[] ces) {
1328             finalCEs = ces;
1329         }
1330         @Override
1331         public long modifyCE32(int ce32) {
1332             assert(!Collation.isSpecialCE32(ce32));
1333             if(CollationBuilder.isTempCE32(ce32)) {
1334                 // retain case bits
1335                 return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1336             } else {
1337                 return Collation.NO_CE;
1338             }
1339         }
1340         @Override
1341         public long modifyCE(long ce) {
1342             if(CollationBuilder.isTempCE(ce)) {
1343                 // retain case bits
1344                 return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000);
1345             } else {
1346                 return Collation.NO_CE;
1347             }
1348         }
1349 
1350         private long[] finalCEs;
1351     };
1352 
1353     /** Replaces temporary CEs with the final CEs they point to. */
1354     private void finalizeCEs() {
1355         CollationDataBuilder newBuilder = new CollationDataBuilder();
1356         newBuilder.initForTailoring(baseData);
1357         CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer());
1358         newBuilder.copyFrom(dataBuilder, finalizer);
1359         dataBuilder = newBuilder;
1360     }
1361 
1362     /**
1363      * Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
1364      * with 2-byte primary, 1-byte secondary and 6-bit tertiary,
1365      * with valid CE byte values.
1366      *
1367      * The index must not exceed 20 bits (0xfffff).
1368      * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY).
1369      *
1370      * Temporary CEs are distinguished from real CEs by their use of
1371      * secondary weights 06..45 which are otherwise reserved for compressed sort keys.
1372      *
1373      * The case bits are unused and available.
1374      */
1375     private static long tempCEFromIndexAndStrength(int index, int strength) {
1376         return
1377             // CE byte offsets, to ensure valid CE bytes, and case bits 11
1378             0x4040000006002000L +
1379             // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
1380             ((long)(index & 0xfe000) << 43) +
1381             // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
1382             ((long)(index & 0x1fc0) << 42) +
1383             // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
1384             ((index & 0x3f) << 24) +
1385             // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
1386             (strength << 8);
1387     }
1388     private static int indexFromTempCE(long tempCE) {
1389         tempCE -= 0x4040000006002000L;
1390         return
1391             ((int)(tempCE >> 43) & 0xfe000) |
1392             ((int)(tempCE >> 42) & 0x1fc0) |
1393             ((int)(tempCE >> 24) & 0x3f);
1394     }
1395     private static int strengthFromTempCE(long tempCE) {
1396         return ((int)tempCE >> 8) & 3;
1397     }
1398     private static boolean isTempCE(long ce) {
1399         int sec = (int)ce >>> 24;
1400         return 6 <= sec && sec <= 0x45;
1401     }
1402 
1403     private static int indexFromTempCE32(int tempCE32) {
1404         tempCE32 -= 0x40400620;
1405         return
1406             ((tempCE32 >> 11) & 0xfe000) |
1407             ((tempCE32 >> 10) & 0x1fc0) |
1408             ((tempCE32 >> 8) & 0x3f);
1409     }
isTempCE32(int ce32)1410     private static boolean isTempCE32(int ce32) {
1411         return
1412             (ce32 & 0xff) >= 2 &&  // not a long-primary/long-secondary CE32
1413             6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
1414     }
1415 
ceStrength(long ce)1416     private static int ceStrength(long ce) {
1417         return
1418             isTempCE(ce) ? strengthFromTempCE(ce) :
1419             (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY :
1420             ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY :
1421             ce != 0 ? Collator.TERTIARY :
1422             Collator.IDENTICAL;
1423     }
1424 
1425     /** At most 1M nodes, limited by the 20 bits in node bit fields. */
1426     private static final int MAX_INDEX = 0xfffff;
1427     /**
1428      * Node bit 6 is set on a primary node if there are nodes
1429      * with secondary values below the common secondary weight (05).
1430      */
1431     private static final int HAS_BEFORE2 = 0x40;
1432     /**
1433      * Node bit 5 is set on a primary or secondary node if there are nodes
1434      * with tertiary values below the common tertiary weight (05).
1435      */
1436     private static final int HAS_BEFORE3 = 0x20;
1437     /**
1438      * Node bit 3 distinguishes a tailored node, which has no weight value,
1439      * from a node with an explicit (root or default) weight.
1440      */
1441     private static final int IS_TAILORED = 8;
1442 
nodeFromWeight32(long weight32)1443     private static long nodeFromWeight32(long weight32) {
1444         return weight32 << 32;
1445     }
nodeFromWeight16(int weight16)1446     private static long nodeFromWeight16(int weight16) {
1447         return (long)weight16 << 48;
1448     }
nodeFromPreviousIndex(int previous)1449     private static long nodeFromPreviousIndex(int previous) {
1450         return (long)previous << 28;
1451     }
nodeFromNextIndex(int next)1452     private static long nodeFromNextIndex(int next) {
1453         return next << 8;
1454     }
nodeFromStrength(int strength)1455     private static long nodeFromStrength(int strength) {
1456         return strength;
1457     }
1458 
weight32FromNode(long node)1459     private static long weight32FromNode(long node) {
1460         return node >>> 32;
1461     }
weight16FromNode(long node)1462     private static int weight16FromNode(long node) {
1463         return (int)(node >> 48) & 0xffff;
1464     }
previousIndexFromNode(long node)1465     private static int previousIndexFromNode(long node) {
1466         return (int)(node >> 28) & MAX_INDEX;
1467     }
nextIndexFromNode(long node)1468     private static int nextIndexFromNode(long node) {
1469         return ((int)node >> 8) & MAX_INDEX;
1470     }
strengthFromNode(long node)1471     private static int strengthFromNode(long node) {
1472         return (int)node & 3;
1473     }
1474 
nodeHasBefore2(long node)1475     private static boolean nodeHasBefore2(long node) {
1476         return (node & HAS_BEFORE2) != 0;
1477     }
nodeHasBefore3(long node)1478     private static boolean nodeHasBefore3(long node) {
1479         return (node & HAS_BEFORE3) != 0;
1480     }
nodeHasAnyBefore(long node)1481     private static boolean nodeHasAnyBefore(long node) {
1482         return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
1483     }
isTailoredNode(long node)1484     private static boolean isTailoredNode(long node) {
1485         return (node & IS_TAILORED) != 0;
1486     }
1487 
changeNodePreviousIndex(long node, int previous)1488     private static long changeNodePreviousIndex(long node, int previous) {
1489         return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous);
1490     }
changeNodeNextIndex(long node, int next)1491     private static long changeNodeNextIndex(long node, int next) {
1492         return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next);
1493     }
1494 
1495     private Normalizer2 nfd, fcd;
1496     private Normalizer2Impl nfcImpl;
1497 
1498     private CollationTailoring base;
1499     private CollationData baseData;
1500     private CollationRootElements rootElements;
1501     private long variableTop;
1502 
1503     private CollationDataBuilder dataBuilder;
1504     private boolean fastLatinEnabled;
1505     private UnicodeSet optimizeSet = new UnicodeSet();
1506 
1507     private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
1508     private int cesLength;
1509 
1510     /**
1511      * Indexes of nodes with root primary weights, sorted by primary.
1512      * Compact form of a TreeMap from root primary to node index.
1513      *
1514      * This is a performance optimization for finding reset positions.
1515      * Without this, we would have to search through the entire nodes list.
1516      * It also allows storing root primary weights in list head nodes,
1517      * without previous index, leaving room in root primary nodes for 32-bit primary weights.
1518      */
1519     private UVector32 rootPrimaryIndexes;
1520     /**
1521      * Data structure for assigning tailored weights and CEs.
1522      * Doubly-linked lists of nodes in mostly collation order.
1523      * Each list starts with a root primary node and ends with a nextIndex of 0.
1524      *
1525      * When there are any nodes in the list, then there is always a root primary node at index 0.
1526      * This allows some code not to have to check explicitly for nextIndex==0.
1527      *
1528      * Root primary nodes have 32-bit weights but do not have previous indexes.
1529      * All other nodes have at most 16-bit weights and do have previous indexes.
1530      *
1531      * Nodes with explicit weights store root collator weights,
1532      * or default weak weights (e.g., secondary 05) for stronger nodes.
1533      * "Tailored" nodes, with the IS_TAILORED bit set,
1534      * do not store explicit weights but rather
1535      * create a difference of a certain strength from the preceding node.
1536      *
1537      * A root node is followed by either
1538      * - a root/default node of the same strength, or
1539      * - a root/default node of the next-weaker strength, or
1540      * - a tailored node of the same strength.
1541      *
1542      * A node of a given strength normally implies "common" weights on weaker levels.
1543      *
1544      * A node with HAS_BEFORE2 must be immediately followed by
1545      * a secondary node with an explicit below-common weight, then a secondary tailored node,
1546      * and later an explicit common-secondary node.
1547      * The below-common weight can be a root weight,
1548      * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
1549      * or before the lowest root weight.
1550      * (&[before 2] resets to an explicit secondary node so that
1551      * the following addRelation(secondary) tailors right after that.
1552      * If we did not have this node and instead were to reset on the primary node,
1553      * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
1554      *
1555      * If the flag is not set, then there are no explicit secondary nodes
1556      * with the common or lower weights.
1557      *
1558      * Same for HAS_BEFORE3 for tertiary nodes and weights.
1559      * A node must not have both flags set.
1560      *
1561      * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
1562      * which point to stable indexes in this list,
1563      * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
1564      *
1565      * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
1566      * until the next relation is added.
1567      *
1568      * At the end, the tailored weights are allocated as necessary,
1569      * then the tailored nodes are replaced with final CEs,
1570      * and the CollationData is rewritten by replacing temporary CEs with final ones.
1571      *
1572      * We cannot simply insert new nodes in the middle of the array
1573      * because that would invalidate the indexes stored in existing temporary CEs.
1574      * We need to use a linked graph with stable indexes to existing nodes.
1575      * A doubly-linked list seems easiest to maintain.
1576      *
1577      * Each node is stored as an long, with its fields stored as bit fields.
1578      *
1579      * Root primary node:
1580      * - primary weight: 32 bits 63..32
1581      * - reserved/unused/zero: 4 bits 31..28
1582      *
1583      * Weaker root nodes & tailored nodes:
1584      * - a weight: 16 bits 63..48
1585      *   + a root or default weight for a non-tailored node
1586      *   + unused/zero for a tailored node
1587      * - index to the previous node: 20 bits 47..28
1588      *
1589      * All types of nodes:
1590      * - index to the next node: 20 bits 27..8
1591      *   + nextIndex=0 in last node per root-primary list
1592      * - reserved/unused/zero bits: bits 7, 4, 2
1593      * - HAS_BEFORE2: bit 6
1594      * - HAS_BEFORE3: bit 5
1595      * - IS_TAILORED: bit 3
1596      * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
1597      *
1598      * We could allocate structs with pointers, but we would have to store them
1599      * in a pointer list so that they can be indexed from temporary CEs,
1600      * and they would require more memory allocations.
1601      */
1602     private UVector64 nodes;
1603 }
1604