• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2013-2015, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
10 *
11 * C++ version created on: 2013aug09
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import ohos.global.icu.lang.UScript;
18 import ohos.global.icu.text.Collator;
19 import ohos.global.icu.util.CharsTrie;
20 
21 final class CollationFastLatinBuilder {
22     // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2
23 
24     /**
25      * Compare two signed long values as if they were unsigned.
26      */
compareInt64AsUnsigned(long a, long b)27     private static final int compareInt64AsUnsigned(long a, long b) {
28         a += 0x8000000000000000L;
29         b += 0x8000000000000000L;
30         if(a < b) {
31             return -1;
32         } else if(a > b) {
33             return 1;
34         } else {
35             return 0;
36         }
37     }
38 
39     /**
40      * Like Java Collections.binarySearch(List, String, Comparator).
41      *
42      * @return the index>=0 where the item was found,
43      *         or the index<0 for inserting the string at ~index in sorted order
44      */
binarySearch(long[] list, int limit, long ce)45     private static final int binarySearch(long[] list, int limit, long ce) {
46         if (limit == 0) { return ~0; }
47         int start = 0;
48         for (;;) {
49             int i = (int)(((long)start + (long)limit) / 2);
50             int cmp = compareInt64AsUnsigned(ce, list[i]);
51             if (cmp == 0) {
52                 return i;
53             } else if (cmp < 0) {
54                 if (i == start) {
55                     return ~start;  // insert ce before i
56                 }
57                 limit = i;
58             } else {
59                 if (i == start) {
60                     return ~(start + 1);  // insert ce after i
61                 }
62                 start = i;
63             }
64         }
65     }
66 
CollationFastLatinBuilder()67     CollationFastLatinBuilder() {
68         ce0 = 0;
69         ce1 = 0;
70         contractionCEs = new UVector64();
71         uniqueCEs = new UVector64();
72         miniCEs = null;
73         firstDigitPrimary = 0;
74         firstLatinPrimary = 0;
75         lastLatinPrimary = 0;
76         firstShortPrimary = 0;
77         shortPrimaryOverflow = false;
78         headerLength = 0;
79     }
80 
forData(CollationData data)81     boolean forData(CollationData data) {
82         if(result.length() != 0) {  // This builder is not reusable.
83             throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder");
84         }
85         if(!loadGroups(data)) { return false; }
86 
87         // Fast handling of digits.
88         firstShortPrimary = firstDigitPrimary;
89         getCEs(data);
90         encodeUniqueCEs();
91         if(shortPrimaryOverflow) {
92             // Give digits long mini primaries,
93             // so that there are more short primaries for letters.
94             firstShortPrimary = firstLatinPrimary;
95             resetCEs();
96             getCEs(data);
97             encodeUniqueCEs();
98         }
99         // Note: If we still have a short-primary overflow but not a long-primary overflow,
100         // then we could calculate how many more long primaries would fit,
101         // and set the firstShortPrimary to that many after the current firstShortPrimary,
102         // and try again.
103         // However, this might only benefit the en_US_POSIX tailoring,
104         // and it is simpler to suppress building fast Latin data for it in genrb,
105         // or by returning false here if shortPrimaryOverflow.
106 
107         boolean ok = !shortPrimaryOverflow;
108         if(ok) {
109             encodeCharCEs();
110             encodeContractions();
111         }
112         contractionCEs.removeAllElements();  // might reduce heap memory usage
113         uniqueCEs.removeAllElements();
114         return ok;
115     }
116 
117     // C++ returns one combined array with the contents of the result buffer.
118     // Java returns two arrays (header & table) because we cannot use pointer arithmetic,
119     // and we do not want to index into the table with an offset.
getHeader()120     char[] getHeader() {
121         char[] resultArray = new char[headerLength];
122         result.getChars(0, headerLength, resultArray, 0);
123         return resultArray;
124     }
125 
getTable()126     char[] getTable() {
127         char[] resultArray = new char[result.length() - headerLength];
128         result.getChars(headerLength, result.length(), resultArray, 0);
129         return resultArray;
130     }
131 
loadGroups(CollationData data)132     private boolean loadGroups(CollationData data) {
133         headerLength = 1 + NUM_SPECIAL_GROUPS;
134         int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
135         result.append((char)r0);
136         // The first few reordering groups should be special groups
137         // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
138         for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
139             lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i);
140             if(lastSpecialPrimaries[i] == 0) {
141                 // missing data
142                 return false;
143             }
144             result.append(0);  // reserve a slot for this group
145         }
146 
147         firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
148         firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
149         lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);
150         if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
151             // missing data
152             return false;
153         }
154         return true;
155     }
156 
inSameGroup(long p, long q)157     private boolean inSameGroup(long p, long q) {
158         // Both or neither need to be encoded as short primaries,
159         // so that we can test only one and use the same bit mask.
160         if(p >= firstShortPrimary) {
161             return q >= firstShortPrimary;
162         } else if(q >= firstShortPrimary) {
163             return false;
164         }
165         // Both or neither must be potentially-variable,
166         // so that we can test only one and determine if both are variable.
167         long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
168         if(p > lastVariablePrimary) {
169             return q > lastVariablePrimary;
170         } else if(q > lastVariablePrimary) {
171             return false;
172         }
173         // Both will be encoded with long mini primaries.
174         // They must be in the same special reordering group,
175         // so that we can test only one and determine if both are variable.
176         assert(p != 0 && q != 0);
177         for(int i = 0;; ++i) {  // will terminate
178             long lastPrimary = lastSpecialPrimaries[i];
179             if(p <= lastPrimary) {
180                 return q <= lastPrimary;
181             } else if(q <= lastPrimary) {
182                 return false;
183             }
184         }
185     }
186 
resetCEs()187     private void resetCEs() {
188         contractionCEs.removeAllElements();
189         uniqueCEs.removeAllElements();
190         shortPrimaryOverflow = false;
191         result.setLength(headerLength);
192     }
193 
getCEs(CollationData data)194     private void getCEs(CollationData data) {
195         int i = 0;
196         for(char c = 0;; ++i, ++c) {
197             if(c == CollationFastLatin.LATIN_LIMIT) {
198                 c = CollationFastLatin.PUNCT_START;
199             } else if(c == CollationFastLatin.PUNCT_LIMIT) {
200                 break;
201             }
202             CollationData d;
203             int ce32 = data.getCE32(c);
204             if(ce32 == Collation.FALLBACK_CE32) {
205                 d = data.base;
206                 ce32 = d.getCE32(c);
207             } else {
208                 d = data;
209             }
210             if(getCEsFromCE32(d, c, ce32)) {
211                 charCEs[i][0] = ce0;
212                 charCEs[i][1] = ce1;
213                 addUniqueCE(ce0);
214                 addUniqueCE(ce1);
215             } else {
216                 // bail out for c
217                 charCEs[i][0] = ce0 = Collation.NO_CE;
218                 charCEs[i][1] = ce1 = 0;
219             }
220             if(c == 0 && !isContractionCharCE(ce0)) {
221                 // Always map U+0000 to a contraction.
222                 // Write a contraction list with only a default value if there is no real contraction.
223                 assert(contractionCEs.isEmpty());
224                 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
225                 charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
226                 charCEs[0][1] = 0;
227             }
228         }
229         // Terminate the last contraction list.
230         contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK);
231     }
232 
getCEsFromCE32(CollationData data, int c, int ce32)233     private boolean getCEsFromCE32(CollationData data, int c, int ce32) {
234         ce32 = data.getFinalCE32(ce32);
235         ce1 = 0;
236         if(Collation.isSimpleOrLongCE32(ce32)) {
237             ce0 = Collation.ceFromCE32(ce32);
238         } else {
239             switch(Collation.tagFromCE32(ce32)) {
240             case Collation.LATIN_EXPANSION_TAG:
241                 ce0 = Collation.latinCE0FromCE32(ce32);
242                 ce1 = Collation.latinCE1FromCE32(ce32);
243                 break;
244             case Collation.EXPANSION32_TAG: {
245                 int index = Collation.indexFromCE32(ce32);
246                 int length = Collation.lengthFromCE32(ce32);
247                 if(length <= 2) {
248                     ce0 = Collation.ceFromCE32(data.ce32s[index]);
249                     if(length == 2) {
250                         ce1 = Collation.ceFromCE32(data.ce32s[index + 1]);
251                     }
252                     break;
253                 } else {
254                     return false;
255                 }
256             }
257             case Collation.EXPANSION_TAG: {
258                 int index = Collation.indexFromCE32(ce32);
259                 int length = Collation.lengthFromCE32(ce32);
260                 if(length <= 2) {
261                     ce0 = data.ces[index];
262                     if(length == 2) {
263                         ce1 = data.ces[index + 1];
264                     }
265                     break;
266                 } else {
267                     return false;
268                 }
269             }
270             // Note: We could support PREFIX_TAG (assert c>=0)
271             // by recursing on its default CE32 and checking that none of the prefixes starts
272             // with a fast Latin character.
273             // However, currently (2013) there are only the L-before-middle-dot
274             // prefix mappings in the Latin range, and those would be rejected anyway.
275             case Collation.CONTRACTION_TAG:
276                 assert(c >= 0);
277                 return getCEsFromContractionCE32(data, ce32);
278             case Collation.OFFSET_TAG:
279                 assert(c >= 0);
280                 ce0 = data.getCEFromOffsetCE32(c, ce32);
281                 break;
282             default:
283                 return false;
284             }
285         }
286         // A mapping can be completely ignorable.
287         if(ce0 == 0) { return ce1 == 0; }
288         // We do not support an ignorable ce0 unless it is completely ignorable.
289         long p0 = ce0 >>> 32;
290         if(p0 == 0) { return false; }
291         // We only support primaries up to the Latin script.
292         if(p0 > lastLatinPrimary) { return false; }
293         // We support non-common secondary and case weights only together with short primaries.
294         int lower32_0 = (int)ce0;
295         if(p0 < firstShortPrimary) {
296             int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
297             if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; }
298         }
299         // No below-common tertiary weights.
300         if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
301         if(ce1 != 0) {
302             // Both primaries must be in the same group,
303             // or both must get short mini primaries,
304             // or a short-primary CE is followed by a secondary CE.
305             // This is so that we can test the first primary and use the same mask for both,
306             // and determine for both whether they are variable.
307             long p1 = ce1 >>> 32;
308             if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; }
309             int lower32_1 = (int)ce1;
310             // No tertiary CEs.
311             if((lower32_1 >>> 16) == 0) { return false; }
312             // We support non-common secondary and case weights
313             // only for secondary CEs or together with short primaries.
314             if(p1 != 0 && p1 < firstShortPrimary) {
315                 int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
316                 if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; }
317             }
318             // No below-common tertiary weights.
319             if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
320         }
321         // No quaternary weights.
322         if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; }
323         return true;
324     }
325 
getCEsFromContractionCE32(CollationData data, int ce32)326     private boolean getCEsFromContractionCE32(CollationData data, int ce32) {
327         int trieIndex = Collation.indexFromCE32(ce32);
328         ce32 = data.getCE32FromContexts(trieIndex);  // Default if no suffix match.
329         // Since the original ce32 is not a prefix mapping,
330         // the default ce32 must not be another contraction.
331         assert(!Collation.isContractionCE32(ce32));
332         int contractionIndex = contractionCEs.size();
333         if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
334             addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
335         } else {
336             // Bail out for c-without-contraction.
337             addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0);
338         }
339         // Handle an encodable contraction unless the next contraction is too long
340         // and starts with the same character.
341         int prevX = -1;
342         boolean addContraction = false;
343         CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0);
344         while(suffixes.hasNext()) {
345             CharsTrie.Entry entry = suffixes.next();
346             CharSequence suffix = entry.chars;
347             int x = CollationFastLatin.getCharIndex(suffix.charAt(0));
348             if(x < 0) { continue; }  // ignore anything but fast Latin text
349             if(x == prevX) {
350                 if(addContraction) {
351                     // Bail out for all contractions starting with this character.
352                     addContractionEntry(x, Collation.NO_CE, 0);
353                     addContraction = false;
354                 }
355                 continue;
356             }
357             if(addContraction) {
358                 addContractionEntry(prevX, ce0, ce1);
359             }
360             ce32 = entry.value;
361             if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
362                 addContraction = true;
363             } else {
364                 addContractionEntry(x, Collation.NO_CE, 0);
365                 addContraction = false;
366             }
367             prevX = x;
368         }
369         if(addContraction) {
370             addContractionEntry(prevX, ce0, ce1);
371         }
372         // Note: There might not be any fast Latin contractions, but
373         // we need to enter contraction handling anyway so that we can bail out
374         // when there is a non-fast-Latin character following.
375         // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
376         // following umlaut and bail out, rather than return the difference of Y vs. u.
377         ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
378         ce1 = 0;
379         return true;
380     }
381 
addContractionEntry(int x, long cce0, long cce1)382     private void addContractionEntry(int x, long cce0, long cce1) {
383         contractionCEs.addElement(x);
384         contractionCEs.addElement(cce0);
385         contractionCEs.addElement(cce1);
386         addUniqueCE(cce0);
387         addUniqueCE(cce1);
388     }
389 
addUniqueCE(long ce)390     private void addUniqueCE(long ce) {
391         if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; }
392         ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
393         int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
394         if(i < 0) {
395             uniqueCEs.insertElementAt(ce, ~i);
396         }
397     }
398 
getMiniCE(long ce)399     private int getMiniCE(long ce) {
400         ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
401         int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
402         assert(index >= 0);
403         return miniCEs[index];
404     }
405 
encodeUniqueCEs()406     private void encodeUniqueCEs() {
407         miniCEs = new char[uniqueCEs.size()];
408         int group = 0;
409         long lastGroupPrimary = lastSpecialPrimaries[group];
410         // The lowest unique CE must be at least a secondary CE.
411         assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
412         long prevPrimary = 0;
413         int prevSecondary = 0;
414         int pri = 0;
415         int sec = 0;
416         int ter = CollationFastLatin.COMMON_TER;
417         for(int i = 0; i < uniqueCEs.size(); ++i) {
418             long ce = uniqueCEs.elementAti(i);
419             // Note: At least one of the p/s/t weights changes from one unique CE to the next.
420             // (uniqueCEs does not store case bits.)
421             long p = ce >>> 32;
422             if(p != prevPrimary) {
423                 while(p > lastGroupPrimary) {
424                     assert(pri <= CollationFastLatin.MAX_LONG);
425                     // Set the group's header entry to the
426                     // last "long primary" in or before the group.
427                     result.setCharAt(1 + group, (char)pri);
428                     if(++group < NUM_SPECIAL_GROUPS) {
429                         lastGroupPrimary = lastSpecialPrimaries[group];
430                     } else {
431                         lastGroupPrimary = 0xffffffffL;
432                         break;
433                     }
434                 }
435                 if(p < firstShortPrimary) {
436                     if(pri == 0) {
437                         pri = CollationFastLatin.MIN_LONG;
438                     } else if(pri < CollationFastLatin.MAX_LONG) {
439                         pri += CollationFastLatin.LONG_INC;
440                     } else {
441     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
442                         printf("long-primary overflow for %08x\n", p);
443     #endif */
444                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
445                         continue;
446                     }
447                 } else {
448                     if(pri < CollationFastLatin.MIN_SHORT) {
449                         pri = CollationFastLatin.MIN_SHORT;
450                     } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) {
451                         // Reserve the highest primary weight for U+FFFF.
452                         pri += CollationFastLatin.SHORT_INC;
453                     } else {
454     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
455                         printf("short-primary overflow for %08x\n", p);
456     #endif */
457                         shortPrimaryOverflow = true;
458                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
459                         continue;
460                     }
461                 }
462                 prevPrimary = p;
463                 prevSecondary = Collation.COMMON_WEIGHT16;
464                 sec = CollationFastLatin.COMMON_SEC;
465                 ter = CollationFastLatin.COMMON_TER;
466             }
467             int lower32 = (int)ce;
468             int s = lower32 >>> 16;
469             if(s != prevSecondary) {
470                 if(pri == 0) {
471                     if(sec == 0) {
472                         sec = CollationFastLatin.MIN_SEC_HIGH;
473                     } else if(sec < CollationFastLatin.MAX_SEC_HIGH) {
474                         sec += CollationFastLatin.SEC_INC;
475                     } else {
476                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
477                         continue;
478                     }
479                     prevSecondary = s;
480                     ter = CollationFastLatin.COMMON_TER;
481                 } else if(s < Collation.COMMON_WEIGHT16) {
482                     if(sec == CollationFastLatin.COMMON_SEC) {
483                         sec = CollationFastLatin.MIN_SEC_BEFORE;
484                     } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) {
485                         sec += CollationFastLatin.SEC_INC;
486                     } else {
487                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
488                         continue;
489                     }
490                 } else if(s == Collation.COMMON_WEIGHT16) {
491                     sec = CollationFastLatin.COMMON_SEC;
492                 } else {
493                     if(sec < CollationFastLatin.MIN_SEC_AFTER) {
494                         sec = CollationFastLatin.MIN_SEC_AFTER;
495                     } else if(sec < CollationFastLatin.MAX_SEC_AFTER) {
496                         sec += CollationFastLatin.SEC_INC;
497                     } else {
498                         miniCEs[i] = CollationFastLatin.BAIL_OUT;
499                         continue;
500                     }
501                 }
502                 prevSecondary = s;
503                 ter = CollationFastLatin.COMMON_TER;
504             }
505             assert((lower32 & Collation.CASE_MASK) == 0);  // blanked out in uniqueCEs
506             int t = lower32 & Collation.ONLY_TERTIARY_MASK;
507             if(t > Collation.COMMON_WEIGHT16) {
508                 if(ter < CollationFastLatin.MAX_TER_AFTER) {
509                     ++ter;
510                 } else {
511                     miniCEs[i] = CollationFastLatin.BAIL_OUT;
512                     continue;
513                 }
514             }
515             if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) {
516                 assert(sec == CollationFastLatin.COMMON_SEC);
517                 miniCEs[i] = (char)(pri | ter);
518             } else {
519                 miniCEs[i] = (char)(pri | sec | ter);
520             }
521         }
522     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
523         printf("last mini primary: %04x\n", pri);
524     #endif */
525     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
526         for(int i = 0; i < uniqueCEs.size(); ++i) {
527             long ce = uniqueCEs.elementAti(i);
528             printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
529         }
530     #endif */
531     }
532 
encodeCharCEs()533     private void encodeCharCEs() {
534         int miniCEsStart = result.length();
535         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
536             result.append(0);  // initialize to completely ignorable
537         }
538         int indexBase = result.length();
539         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
540             long ce = charCEs[i][0];
541             if(isContractionCharCE(ce)) { continue; }  // defer contraction
542             int miniCE = encodeTwoCEs(ce, charCEs[i][1]);
543             if((miniCE >>> 16) > 0) {   // if ((unsigned)miniCE > 0xffff)
544                 // Note: There is a chance that this new expansion is the same as a previous one,
545                 // and if so, then we could reuse the other expansion.
546                 // However, that seems unlikely.
547                 int expansionIndex = result.length() - indexBase;
548                 if(expansionIndex > CollationFastLatin.INDEX_MASK) {
549                     miniCE = CollationFastLatin.BAIL_OUT;
550                 } else {
551                     result.append((char)(miniCE >> 16)).append((char)miniCE);
552                     miniCE = CollationFastLatin.EXPANSION | expansionIndex;
553                 }
554             }
555             result.setCharAt(miniCEsStart + i, (char)miniCE);
556         }
557     }
558 
encodeContractions()559     private void encodeContractions() {
560         // We encode all contraction lists so that the first word of a list
561         // terminates the previous list, and we only need one additional terminator at the end.
562         int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS;
563         int firstContractionIndex = result.length();
564         for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
565             long ce = charCEs[i][0];
566             if(!isContractionCharCE(ce)) { continue; }
567             int contractionIndex = result.length() - indexBase;
568             if(contractionIndex > CollationFastLatin.INDEX_MASK) {
569                 result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT);
570                 continue;
571             }
572             boolean firstTriple = true;
573             for(int index = (int)ce & 0x7fffffff;; index += 3) {
574                 long x = contractionCEs.elementAti(index);
575                 if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; }
576                 long cce0 = contractionCEs.elementAti(index + 1);
577                 long cce1 = contractionCEs.elementAti(index + 2);
578                 int miniCE = encodeTwoCEs(cce0, cce1);
579                 if(miniCE == CollationFastLatin.BAIL_OUT) {
580                     result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
581                 } else if((miniCE >>> 16) == 0) {  // if ((unsigned)miniCE <= 0xffff)
582                     result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
583                     result.append((char)miniCE);
584                 } else {
585                     result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
586                     result.append((char)(miniCE >> 16)).append((char)miniCE);
587                 }
588                 firstTriple = false;
589             }
590             // Note: There is a chance that this new contraction list is the same as a previous one,
591             // and if so, then we could truncate the result and reuse the other list.
592             // However, that seems unlikely.
593             result.setCharAt(headerLength + i,
594                             (char)(CollationFastLatin.CONTRACTION | contractionIndex));
595         }
596         if(result.length() > firstContractionIndex) {
597             // Terminate the last contraction list.
598             result.append((char)CollationFastLatin.CONTR_CHAR_MASK);
599         }
600     /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
601         printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
602         puts("   header & below-digit groups map");
603         int i = 0;
604         for(; i < headerLength; ++i) {
605             printf(" %04x", result[i]);
606         }
607         printf("\n   char mini CEs");
608         assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0);
609         for(; i < indexBase; i += 16) {
610             int c = i - headerLength;
611             if(c >= CollationFastLatin.LATIN_LIMIT) {
612                 c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT;
613             }
614             printf("\n %04x:", c);
615             for(int j = 0; j < 16; ++j) {
616                 printf(" %04x", result[i + j]);
617             }
618         }
619         printf("\n   expansions & contractions");
620         for(; i < result.length(); ++i) {
621             if((i - indexBase) % 16 == 0) { puts(""); }
622             printf(" %04x", result[i]);
623         }
624         puts("");
625     #endif */
626     }
627 
encodeTwoCEs(long first, long second)628     private int encodeTwoCEs(long first, long second) {
629         if(first == 0) {
630             return 0;  // completely ignorable
631         }
632         if(first == Collation.NO_CE) {
633             return CollationFastLatin.BAIL_OUT;
634         }
635         assert((first >>> 32) != Collation.NO_CE_PRIMARY);
636 
637         int miniCE = getMiniCE(first);
638         if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; }
639         if(miniCE >= CollationFastLatin.MIN_SHORT) {
640             // Extract & copy the case bits.
641             // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
642             int c = (((int)first & Collation.CASE_MASK) >> (14 - 3));
643             // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
644             c += CollationFastLatin.LOWER_CASE;
645             miniCE |= c;
646         }
647         if(second == 0) { return miniCE; }
648 
649         int miniCE1 = getMiniCE(second);
650         if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; }
651 
652         int case1 = (int)second & Collation.CASE_MASK;
653         if(miniCE >= CollationFastLatin.MIN_SHORT &&
654                 (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) {
655             // Try to combine the two mini CEs into one.
656             int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK;
657             int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK;
658             if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 &&
659                     ter1 == CollationFastLatin.COMMON_TER) {
660                 // sec1>=sec_high implies pri1==0.
661                 return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1;
662             }
663         }
664 
665         if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) {
666             // Secondary CE, or a CE with a short primary, copy the case bits.
667             case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE;
668             miniCE1 |= case1;
669         }
670         return (miniCE << 16) | miniCE1;
671     }
672 
isContractionCharCE(long ce)673     private static boolean isContractionCharCE(long ce) {
674         return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE;
675     }
676 
677     // space, punct, symbol, currency (not digit)
678     private static final int NUM_SPECIAL_GROUPS =
679             Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1;
680 
681     private static final long CONTRACTION_FLAG = 0x80000000L;
682 
683     // temporary "buffer"
684     private long ce0, ce1;
685 
686     private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2];
687 
688     private UVector64 contractionCEs;
689     private UVector64 uniqueCEs;
690 
691     /** One 16-bit mini CE per unique CE. */
692     private char[] miniCEs;
693 
694     // These are constant for a given root collator.
695     long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS];
696     private long firstDigitPrimary;
697     private long firstLatinPrimary;
698     private long lastLatinPrimary;
699     // This determines the first normal primary weight which is mapped to
700     // a short mini primary. It must be >=firstDigitPrimary.
701     private long firstShortPrimary;
702 
703     private boolean shortPrimaryOverflow;
704 
705     private StringBuilder result = new StringBuilder();
706     private int headerLength;
707 }
708