• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationbasedatabuilder.cpp
9 *
10 * created on: 2012aug11
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/localpointer.h"
19 #include "unicode/ucharstriebuilder.h"
20 #include "unicode/uniset.h"
21 #include "unicode/unistr.h"
22 #include "unicode/utf16.h"
23 #include "collation.h"
24 #include "collationbasedatabuilder.h"
25 #include "collationdata.h"
26 #include "collationdatabuilder.h"
27 #include "collationrootelements.h"
28 #include "normalizer2impl.h"
29 #include "uassert.h"
30 #include "utrie2.h"
31 #include "uvectr32.h"
32 #include "uvectr64.h"
33 #include "uvector.h"
34 
35 U_NAMESPACE_BEGIN
36 
37 namespace {
38 
39 /**
40  * Compare two signed int64_t values as if they were unsigned.
41  */
42 int32_t
compareInt64AsUnsigned(int64_t a,int64_t b)43 compareInt64AsUnsigned(int64_t a, int64_t b) {
44     if((uint64_t)a < (uint64_t)b) {
45         return -1;
46     } else if((uint64_t)a > (uint64_t)b) {
47         return 1;
48     } else {
49         return 0;
50     }
51 }
52 
53 // TODO: Try to merge this with the binarySearch in alphaindex.cpp.
54 /**
55  * Like Java Collections.binarySearch(List, String, Comparator).
56  *
57  * @return the index>=0 where the item was found,
58  *         or the index<0 for inserting the string at ~index in sorted order
59  */
60 int32_t
binarySearch(const UVector64 & list,int64_t ce)61 binarySearch(const UVector64 &list, int64_t ce) {
62     if (list.size() == 0) { return ~0; }
63     int32_t start = 0;
64     int32_t limit = list.size();
65     for (;;) {
66         int32_t i = (start + limit) / 2;
67         int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
68         if (cmp == 0) {
69             return i;
70         } else if (cmp < 0) {
71             if (i == start) {
72                 return ~start;  // insert ce before i
73             }
74             limit = i;
75         } else {
76             if (i == start) {
77                 return ~(start + 1);  // insert ce after i
78             }
79             start = i;
80         }
81     }
82 }
83 
84 }  // namespace
85 
CollationBaseDataBuilder(UBool icu4xMode,UErrorCode & errorCode)86 CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
87         : CollationDataBuilder(icu4xMode, errorCode),
88           numericPrimary(0x12000000),
89           firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
90           rootElements(errorCode),
91           scriptStartsLength(1) {
92     uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
93     uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
94     this->icu4xMode = icu4xMode;
95 }
96 
~CollationBaseDataBuilder()97 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
98 }
99 
100 void
init(UErrorCode & errorCode)101 CollationBaseDataBuilder::init(UErrorCode &errorCode) {
102     if(U_FAILURE(errorCode)) { return; }
103     if(trie != NULL) {
104         errorCode = U_INVALID_STATE_ERROR;
105         return;
106     }
107 
108     // Not compressible:
109     // - digits
110     // - Latin
111     // - Hani
112     // - trail weights
113     // Some scripts are compressible, some are not.
114     uprv_memset(compressibleBytes, false, 256);
115     compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = true;
116 
117     // For a base, the default is to compute an unassigned-character implicit CE.
118     // This includes surrogate code points; see the last option in
119     // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
120     trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
121 
122     // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
123     // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
124     // precomposed characters.
125     for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
126         utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
127     }
128 
129     utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
130     // No root element for the merge separator which has 02 weights.
131     // Some code assumes that the root first primary CE is the "space first primary"
132     // from FractionalUCA.txt.
133 
134     if (!icu4xMode) {
135         uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
136         utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
137     }
138 
139     // Add a mapping for the first-unassigned boundary,
140     // which is the AlphabeticIndex overflow boundary.
141     UnicodeString s((UChar)0xfdd1);  // Script boundary contractions start with U+FDD1.
142     s.append((UChar)0xfdd0);  // Zzzz script sample character U+FDD0.
143     int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
144     add(UnicodeString(), s, &ce, 1, errorCode);
145 
146     // Add a tailoring boundary, but not a mapping, for [first trailing].
147     ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
148     rootElements.addElement(ce, errorCode);
149 
150     // U+FFFD maps to a CE with the third-highest primary weight,
151     // for predictable handling of ill-formed UTF-8.
152     uint32_t ce32 = Collation::FFFD_CE32;
153     utrie2_set32(trie, 0xfffd, ce32, &errorCode);
154     addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
155 
156     // U+FFFF maps to a CE with the highest primary weight.
157     ce32 = Collation::MAX_REGULAR_CE32;
158     utrie2_set32(trie, 0xffff, ce32, &errorCode);
159     addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
160 }
161 
162 void
initHanRanges(const UChar32 ranges[],int32_t length,UErrorCode & errorCode)163 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
164                                         UErrorCode &errorCode) {
165     if(U_FAILURE(errorCode) || length == 0) { return; }
166     if((length & 1) != 0) {  // incomplete start/end pairs
167         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
168         return;
169     }
170     if(isAssigned(0x4e00)) {  // already set
171         errorCode = U_INVALID_STATE_ERROR;
172         return;
173     }
174     int32_t numHanCodePoints = 0;
175     for(int32_t i = 0; i < length; i += 2) {
176         UChar32 start = ranges[i];
177         UChar32 end = ranges[i + 1];
178         numHanCodePoints += end - start + 1;
179     }
180     // Multiply the number of code points by (gap+1).
181     // Add hanStep+2 for tailoring after the last Han character.
182     int32_t gap = 1;
183     hanStep = gap + 1;
184     int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
185     // Numbers of Han primaries per lead byte determined by
186     // numbers of 2nd (not compressible) times 3rd primary byte values.
187     int32_t numHanPerLeadByte = 254 * 254;
188     int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
189     uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
190     hanPrimary |= 0x20200;
191     firstHanPrimary = hanPrimary;
192     for(int32_t i = 0; i < length; i += 2) {
193         UChar32 start = ranges[i];
194         UChar32 end = ranges[i + 1];
195         hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
196     }
197     // One past the actual last one, but that is harmless for tailoring.
198     // It saves us from subtracting "hanStep" and handling underflows.
199     lastHanPrimary = hanPrimary;
200 }
201 
202 UBool
isCompressibleLeadByte(uint32_t b) const203 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
204     return compressibleBytes[b];
205 }
206 
207 void
setCompressibleLeadByte(uint32_t b)208 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
209     compressibleBytes[b] = true;
210 }
211 
212 int32_t
diffTwoBytePrimaries(uint32_t p1,uint32_t p2,UBool isCompressible)213 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
214     if((p1 & 0xff000000) == (p2 & 0xff000000)) {
215         // Same lead bytes.
216         return (int32_t)(p2 - p1) >> 16;
217     } else {
218         int32_t linear1;
219         int32_t linear2;
220         int32_t factor;
221         if(isCompressible) {
222             // Second byte for compressible lead byte: 251 bytes 04..FE
223             linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
224             linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
225             factor = 251;
226         } else {
227             // Second byte for incompressible lead byte: 254 bytes 02..FF
228             linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
229             linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
230             factor = 254;
231         }
232         linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
233         linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
234         return linear2 - linear1;
235     }
236 }
237 
238 int32_t
diffThreeBytePrimaries(uint32_t p1,uint32_t p2,UBool isCompressible)239 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
240     if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
241         // Same first two bytes.
242         return (int32_t)(p2 - p1) >> 8;
243     } else {
244         // Third byte: 254 bytes 02..FF
245         int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
246         int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
247         int32_t factor;
248         if(isCompressible) {
249             // Second byte for compressible lead byte: 251 bytes 04..FE
250             linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
251             linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
252             factor = 251 * 254;
253         } else {
254             // Second byte for incompressible lead byte: 254 bytes 02..FF
255             linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
256             linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
257             factor = 254 * 254;
258         }
259         linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
260         linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
261         return linear2 - linear1;
262     }
263 }
264 
265 uint32_t
encodeCEs(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)266 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
267     addRootElements(ces, cesLength, errorCode);
268     return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
269 }
270 
271 void
addRootElements(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)272 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
273                                           UErrorCode &errorCode) {
274     if(U_FAILURE(errorCode)) { return; }
275     for(int32_t i = 0; i < cesLength; ++i) {
276         addRootElement(ces[i], errorCode);
277     }
278 }
279 
280 void
addRootElement(int64_t ce,UErrorCode & errorCode)281 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
282     if(U_FAILURE(errorCode) || ce == 0) { return; }
283     // Remove case bits.
284     ce &= INT64_C(0xffffffffffff3fff);
285     U_ASSERT((ce & 0xc0) == 0);  // quaternary==0
286     // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
287     // We will add it later, as part of the Han ranges.
288     uint32_t p = (uint32_t)(ce >> 32);
289     uint32_t secTer = (uint32_t)ce;
290     if(firstHanPrimary <= p && p <= lastHanPrimary) {
291         if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
292             // buildRootElementsTable() does not currently handle this case.
293             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
294             return;
295         }
296         if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
297             return;
298         }
299     }
300     if(secTer != Collation::COMMON_SEC_AND_TER_CE) {  // minor optimization
301         // Check that secondary and tertiary weights are > 01.
302         uint32_t s = secTer >> 16;
303         uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
304         if((s != 0 && s <= Collation::BEFORE_WEIGHT16) ||
305                 (t != 0 && t <= Collation::BEFORE_WEIGHT16)) {
306             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307             return;
308         }
309     }
310     // Check that primaries have at most 3 bytes.
311     if((p & 0xff) != 0) {
312         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313         return;
314     }
315     int32_t i = binarySearch(rootElements, ce);
316     if(i < 0) {
317         rootElements.insertElementAt(ce, ~i, errorCode);
318     }
319 }
320 
321 void
addScriptStart(int32_t script,uint32_t p)322 CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
323     // The primary weight must be the lowest possible for a two-byte prefix.
324     // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
325     U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
326     p >>= 8;
327     U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
328     p >>= 8;
329     uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
330     if((p & 0xff) == lowestP2) {
331         // The script really starts on a lead byte boundary. Round down to that.
332         p &= 0xff00;
333     }
334     // Script starts should be added in ascending order, otherwise we would need to sort them.
335     if(script < UCOL_REORDER_CODE_FIRST) {
336         U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
337     } else {
338         U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
339         script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
340     }
341     if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
342         // Two scripts share a range (e.g., Hira & Kana).
343         scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
344     } else {
345         U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
346         U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
347         scriptsIndex[script] = (uint16_t)scriptStartsLength;
348         scriptStarts[scriptStartsLength++] = (uint16_t)p;
349     }
350     if(script == USCRIPT_UNKNOWN) {
351         // The last script start is for unassigned code points
352         // (with high implicit primary weights).
353         // Add one more entry with the limit of this range,
354         // which is the start of the trailing-weights range.
355         U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
356         scriptStarts[scriptStartsLength++] =
357                 (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
358     }
359 }
360 
361 void
build(CollationData & data,UErrorCode & errorCode)362 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
363     buildMappings(data, errorCode);
364     data.numericPrimary = numericPrimary;
365     data.compressibleBytes = compressibleBytes;
366 
367     int32_t numScripts = USCRIPT_CODE_LIMIT;
368     while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
369     // Move the 16 special groups (not all used)
370     // down for contiguous storage of the script and special-group indexes.
371     for(int32_t i = 0; i < 16; ++i) {
372         scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
373     }
374     data.numScripts = numScripts;
375     data.scriptsIndex = scriptsIndex;
376     data.scriptStarts = scriptStarts;
377     data.scriptStartsLength = scriptStartsLength;
378     buildFastLatinTable(data, errorCode);
379 }
380 
381 void
buildRootElementsTable(UVector32 & table,UErrorCode & errorCode)382 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
383     // Limit sentinel for root elements.
384     // This allows us to reduce range checks at runtime.
385     rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode);
386     if(U_FAILURE(errorCode)) { return; }
387     uint32_t nextHanPrimary = firstHanPrimary;  // Set to 0xffffffff after the last Han range.
388     uint32_t prevPrimary = 0;  // Start with primary ignorable CEs.
389     UBool needCommonSecTerUnit = false;
390     UBool hasDeltaUnit = false;
391     for(int32_t i = 0; i < rootElements.size(); ++i) {
392         int64_t ce = rootElements.elementAti(i);
393         uint32_t p = (uint32_t)(ce >> 32);
394         uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
395         if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) {
396             // The last primary had low sec/ter weights but no common sec/ter combination.
397             // The next unit is either a new primary or an above-common sec/ter unit.
398             // Insert a common sec/ter unit so that the builder will reliably
399             // tailor to either before or after a common weight but not across it.
400             table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE |
401                             CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
402         }
403         if(p != prevPrimary) {
404             U_ASSERT((p & 0xff) == 0);
405             int32_t end;
406             if(p >= nextHanPrimary) {
407                 // Add a Han primary weight or range.
408                 // We omitted them initially, and omitted all CEs with Han primaries
409                 // and common secondary/tertiary weights.
410                 U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE);
411                 if(p == nextHanPrimary) {
412                     // One single Han primary with non-common secondary/tertiary weights.
413                     table.addElement((int32_t)p, errorCode);
414                     if(p < lastHanPrimary) {
415                         // Prepare for the next Han range.
416                         nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
417                     } else {
418                         // p is the last Han primary.
419                         nextHanPrimary = 0xffffffff;
420                     }
421                 } else {
422                     // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
423                     table.addElement((int32_t)nextHanPrimary, errorCode);
424                     if(nextHanPrimary == lastHanPrimary) {
425                         // nextHanPrimary == lastHanPrimary < p
426                         // We just wrote the single last Han primary.
427                         nextHanPrimary = 0xffffffff;
428                         table.addElement((int32_t)p, errorCode);
429                     } else if(p < lastHanPrimary) {
430                         // nextHanPrimary < p < lastHanPrimary
431                         // End the Han range on p, prepare for the next range.
432                         table.addElement((int32_t)p | hanStep, errorCode);
433                         nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
434                     } else if(p == lastHanPrimary) {
435                         // nextHanPrimary < p == lastHanPrimary
436                         // End the last Han range on p.
437                         table.addElement((int32_t)p | hanStep, errorCode);
438                         nextHanPrimary = 0xffffffff;
439                     } else {
440                         // nextHanPrimary < lastHanPrimary < p
441                         // End the last Han range, then write p.
442                         table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
443                         nextHanPrimary = 0xffffffff;
444                         table.addElement((int32_t)p, errorCode);
445                     }
446                 }
447             } else if(prevPrimary != 0 &&
448                     // If there has not been an intervening delta unit,
449                     // then we will try to combine the previous primary and
450                     // the next several primaries into a range.
451                     !hasDeltaUnit &&
452                     // Might get a range with more than two primaries if the current CE
453                     // has common sec/ter weights.
454                     secTer == Collation::COMMON_SEC_AND_TER_CE &&
455                     (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
456                 // Multiple CEs with only common secondary/tertiary weights were
457                 // combined into a primary range.
458                 // The range end was written, ending with the primary of rootElements[end].
459                 ce = rootElements.elementAti(end);
460                 p = (uint32_t)(ce >> 32);
461                 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
462                 i = end;
463             } else {
464                 // Write the primary weight of a normal CE.
465                 table.addElement((int32_t)p, errorCode);
466             }
467             prevPrimary = p;
468             needCommonSecTerUnit = false;
469             hasDeltaUnit = false;
470         }
471         if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) {
472             // The common secondar/tertiary weights are implied in the primary unit.
473         } else {
474             if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
475                 // Remember to not suppress a common sec/ter unit if p!=0.
476                 needCommonSecTerUnit = p != 0;
477             } else if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
478                 // Real common sec/ter unit, no need to insert an artificial one.
479                 needCommonSecTerUnit = false;
480             }
481             // For each new set of secondary/tertiary weights we write a delta unit.
482             table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
483             hasDeltaUnit = true;
484         }
485     }
486 }
487 
488 int32_t
writeRootElementsRange(uint32_t prevPrimary,uint32_t p,int32_t i,UVector32 & table,UErrorCode & errorCode)489 CollationBaseDataBuilder::writeRootElementsRange(
490         uint32_t prevPrimary, uint32_t p, int32_t i,
491         UVector32 &table, UErrorCode &errorCode) {
492     if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
493     U_ASSERT(prevPrimary < p);
494     // No ranges of single-byte primaries.
495     if((p & prevPrimary & 0xff0000) == 0) { return 0; }
496     // Lead bytes of compressible primaries must match.
497     UBool isCompressible = isCompressiblePrimary(p);
498     if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
499             (p & 0xff000000) != (prevPrimary & 0xff000000)) {
500         return 0;
501     }
502     // Number of bytes in the primaries.
503     UBool twoBytes;
504     // Number of primaries from prevPrimary to p.
505     int32_t step;
506     if((p & 0xff00) == 0) {
507         // 2-byte primary
508         if((prevPrimary & 0xff00) != 0) { return 0; }  // length mismatch
509         twoBytes = true;
510         step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
511     } else {
512         // 3-byte primary
513         if((prevPrimary & 0xff00) == 0) { return 0; }  // length mismatch
514         twoBytes = false;
515         step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
516     }
517     if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
518     // See if there are more than two CEs with primaries increasing by "step"
519     // and with only common secondary/tertiary weights on all but the last one.
520     int32_t end = 0;  // Initially 0: No range for just two primaries.
521     for(;;) {
522         prevPrimary = p;
523         // Calculate which primary we expect next.
524         uint32_t nextPrimary;  // = p + step
525         if(twoBytes) {
526             nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
527         } else {
528             nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
529         }
530         // Fetch the actual next CE.
531         int64_t ce = rootElements.elementAti(i);
532         p = (uint32_t)(ce >> 32);
533         uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
534         // Does this primary increase by "step" from the last one?
535         if(p != nextPrimary ||
536                 // Do not cross into a new lead byte if either is compressible.
537                 ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
538                     (isCompressible || isCompressiblePrimary(p)))) {
539             // The range ends with the previous CE.
540             p = prevPrimary;
541             break;
542         }
543         // Extend the range to include this primary.
544         end = i++;
545         // This primary is the last in the range if it has non-common weights
546         // or if we are at the end of the list.
547         if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
548     }
549     if(end != 0) {
550         table.addElement((int32_t)p | step, errorCode);
551     }
552     return end;
553 }
554 
555 U_NAMESPACE_END
556 
557 #endif  // !UCONFIG_NO_COLLATION
558