1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationbasedatabuilder.cpp
9 *
10 * created on: 2012aug11
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/localpointer.h"
19 #include "unicode/ucharstriebuilder.h"
20 #include "unicode/uniset.h"
21 #include "unicode/unistr.h"
22 #include "unicode/utf16.h"
23 #include "collation.h"
24 #include "collationbasedatabuilder.h"
25 #include "collationdata.h"
26 #include "collationdatabuilder.h"
27 #include "collationrootelements.h"
28 #include "normalizer2impl.h"
29 #include "uassert.h"
30 #include "utrie2.h"
31 #include "uvectr32.h"
32 #include "uvectr64.h"
33 #include "uvector.h"
34
35 U_NAMESPACE_BEGIN
36
37 namespace {
38
39 /**
40 * Compare two signed int64_t values as if they were unsigned.
41 */
42 int32_t
compareInt64AsUnsigned(int64_t a,int64_t b)43 compareInt64AsUnsigned(int64_t a, int64_t b) {
44 if((uint64_t)a < (uint64_t)b) {
45 return -1;
46 } else if((uint64_t)a > (uint64_t)b) {
47 return 1;
48 } else {
49 return 0;
50 }
51 }
52
53 // TODO: Try to merge this with the binarySearch in alphaindex.cpp.
54 /**
55 * Like Java Collections.binarySearch(List, String, Comparator).
56 *
57 * @return the index>=0 where the item was found,
58 * or the index<0 for inserting the string at ~index in sorted order
59 */
60 int32_t
binarySearch(const UVector64 & list,int64_t ce)61 binarySearch(const UVector64 &list, int64_t ce) {
62 if (list.size() == 0) { return ~0; }
63 int32_t start = 0;
64 int32_t limit = list.size();
65 for (;;) {
66 int32_t i = (start + limit) / 2;
67 int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
68 if (cmp == 0) {
69 return i;
70 } else if (cmp < 0) {
71 if (i == start) {
72 return ~start; // insert ce before i
73 }
74 limit = i;
75 } else {
76 if (i == start) {
77 return ~(start + 1); // insert ce after i
78 }
79 start = i;
80 }
81 }
82 }
83
84 } // namespace
85
CollationBaseDataBuilder(UBool icu4xMode,UErrorCode & errorCode)86 CollationBaseDataBuilder::CollationBaseDataBuilder(UBool icu4xMode, UErrorCode &errorCode)
87 : CollationDataBuilder(icu4xMode, errorCode),
88 numericPrimary(0x12000000),
89 firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
90 rootElements(errorCode),
91 scriptStartsLength(1) {
92 uprv_memset(scriptsIndex, 0, sizeof(scriptsIndex));
93 uprv_memset(scriptStarts, 0, sizeof(scriptStarts));
94 this->icu4xMode = icu4xMode;
95 }
96
~CollationBaseDataBuilder()97 CollationBaseDataBuilder::~CollationBaseDataBuilder() {
98 }
99
100 void
init(UErrorCode & errorCode)101 CollationBaseDataBuilder::init(UErrorCode &errorCode) {
102 if(U_FAILURE(errorCode)) { return; }
103 if(trie != NULL) {
104 errorCode = U_INVALID_STATE_ERROR;
105 return;
106 }
107
108 // Not compressible:
109 // - digits
110 // - Latin
111 // - Hani
112 // - trail weights
113 // Some scripts are compressible, some are not.
114 uprv_memset(compressibleBytes, false, 256);
115 compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = true;
116
117 // For a base, the default is to compute an unassigned-character implicit CE.
118 // This includes surrogate code points; see the last option in
119 // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
120 trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
121
122 // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
123 // In the ICU4X case, only preallocate ASCII, because we don't store CE32s for
124 // precomposed characters.
125 for(UChar32 c = 0; c < (icu4xMode ? 0x80 : 0x180); ++c) {
126 utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
127 }
128
129 utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
130 // No root element for the merge separator which has 02 weights.
131 // Some code assumes that the root first primary CE is the "space first primary"
132 // from FractionalUCA.txt.
133
134 if (!icu4xMode) {
135 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
136 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, true, &errorCode);
137 }
138
139 // Add a mapping for the first-unassigned boundary,
140 // which is the AlphabeticIndex overflow boundary.
141 UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with U+FDD1.
142 s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0.
143 int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
144 add(UnicodeString(), s, &ce, 1, errorCode);
145
146 // Add a tailoring boundary, but not a mapping, for [first trailing].
147 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
148 rootElements.addElement(ce, errorCode);
149
150 // U+FFFD maps to a CE with the third-highest primary weight,
151 // for predictable handling of ill-formed UTF-8.
152 uint32_t ce32 = Collation::FFFD_CE32;
153 utrie2_set32(trie, 0xfffd, ce32, &errorCode);
154 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
155
156 // U+FFFF maps to a CE with the highest primary weight.
157 ce32 = Collation::MAX_REGULAR_CE32;
158 utrie2_set32(trie, 0xffff, ce32, &errorCode);
159 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
160 }
161
162 void
initHanRanges(const UChar32 ranges[],int32_t length,UErrorCode & errorCode)163 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
164 UErrorCode &errorCode) {
165 if(U_FAILURE(errorCode) || length == 0) { return; }
166 if((length & 1) != 0) { // incomplete start/end pairs
167 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
168 return;
169 }
170 if(isAssigned(0x4e00)) { // already set
171 errorCode = U_INVALID_STATE_ERROR;
172 return;
173 }
174 int32_t numHanCodePoints = 0;
175 for(int32_t i = 0; i < length; i += 2) {
176 UChar32 start = ranges[i];
177 UChar32 end = ranges[i + 1];
178 numHanCodePoints += end - start + 1;
179 }
180 // Multiply the number of code points by (gap+1).
181 // Add hanStep+2 for tailoring after the last Han character.
182 int32_t gap = 1;
183 hanStep = gap + 1;
184 int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
185 // Numbers of Han primaries per lead byte determined by
186 // numbers of 2nd (not compressible) times 3rd primary byte values.
187 int32_t numHanPerLeadByte = 254 * 254;
188 int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
189 uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
190 hanPrimary |= 0x20200;
191 firstHanPrimary = hanPrimary;
192 for(int32_t i = 0; i < length; i += 2) {
193 UChar32 start = ranges[i];
194 UChar32 end = ranges[i + 1];
195 hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
196 }
197 // One past the actual last one, but that is harmless for tailoring.
198 // It saves us from subtracting "hanStep" and handling underflows.
199 lastHanPrimary = hanPrimary;
200 }
201
202 UBool
isCompressibleLeadByte(uint32_t b) const203 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
204 return compressibleBytes[b];
205 }
206
207 void
setCompressibleLeadByte(uint32_t b)208 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
209 compressibleBytes[b] = true;
210 }
211
212 int32_t
diffTwoBytePrimaries(uint32_t p1,uint32_t p2,UBool isCompressible)213 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
214 if((p1 & 0xff000000) == (p2 & 0xff000000)) {
215 // Same lead bytes.
216 return (int32_t)(p2 - p1) >> 16;
217 } else {
218 int32_t linear1;
219 int32_t linear2;
220 int32_t factor;
221 if(isCompressible) {
222 // Second byte for compressible lead byte: 251 bytes 04..FE
223 linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
224 linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
225 factor = 251;
226 } else {
227 // Second byte for incompressible lead byte: 254 bytes 02..FF
228 linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
229 linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
230 factor = 254;
231 }
232 linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
233 linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
234 return linear2 - linear1;
235 }
236 }
237
238 int32_t
diffThreeBytePrimaries(uint32_t p1,uint32_t p2,UBool isCompressible)239 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
240 if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
241 // Same first two bytes.
242 return (int32_t)(p2 - p1) >> 8;
243 } else {
244 // Third byte: 254 bytes 02..FF
245 int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
246 int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
247 int32_t factor;
248 if(isCompressible) {
249 // Second byte for compressible lead byte: 251 bytes 04..FE
250 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
251 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
252 factor = 251 * 254;
253 } else {
254 // Second byte for incompressible lead byte: 254 bytes 02..FF
255 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
256 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
257 factor = 254 * 254;
258 }
259 linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
260 linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
261 return linear2 - linear1;
262 }
263 }
264
265 uint32_t
encodeCEs(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)266 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
267 addRootElements(ces, cesLength, errorCode);
268 return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
269 }
270
271 void
addRootElements(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)272 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
273 UErrorCode &errorCode) {
274 if(U_FAILURE(errorCode)) { return; }
275 for(int32_t i = 0; i < cesLength; ++i) {
276 addRootElement(ces[i], errorCode);
277 }
278 }
279
280 void
addRootElement(int64_t ce,UErrorCode & errorCode)281 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
282 if(U_FAILURE(errorCode) || ce == 0) { return; }
283 // Remove case bits.
284 ce &= INT64_C(0xffffffffffff3fff);
285 U_ASSERT((ce & 0xc0) == 0); // quaternary==0
286 // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
287 // We will add it later, as part of the Han ranges.
288 uint32_t p = (uint32_t)(ce >> 32);
289 uint32_t secTer = (uint32_t)ce;
290 if(firstHanPrimary <= p && p <= lastHanPrimary) {
291 if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
292 // buildRootElementsTable() does not currently handle this case.
293 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
294 return;
295 }
296 if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
297 return;
298 }
299 }
300 if(secTer != Collation::COMMON_SEC_AND_TER_CE) { // minor optimization
301 // Check that secondary and tertiary weights are > 01.
302 uint32_t s = secTer >> 16;
303 uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
304 if((s != 0 && s <= Collation::BEFORE_WEIGHT16) ||
305 (t != 0 && t <= Collation::BEFORE_WEIGHT16)) {
306 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307 return;
308 }
309 }
310 // Check that primaries have at most 3 bytes.
311 if((p & 0xff) != 0) {
312 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313 return;
314 }
315 int32_t i = binarySearch(rootElements, ce);
316 if(i < 0) {
317 rootElements.insertElementAt(ce, ~i, errorCode);
318 }
319 }
320
321 void
addScriptStart(int32_t script,uint32_t p)322 CollationBaseDataBuilder::addScriptStart(int32_t script, uint32_t p) {
323 // The primary weight must be the lowest possible for a two-byte prefix.
324 // It could be 2, 3, or 4 bytes long. We round down to the two-byte boundary.
325 U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
326 p >>= 8;
327 U_ASSERT((p & 0xff) == 0 || (p & 0xff) == 2);
328 p >>= 8;
329 uint32_t lowestP2 = compressibleBytes[p >> 8] ? 4 : 2;
330 if((p & 0xff) == lowestP2) {
331 // The script really starts on a lead byte boundary. Round down to that.
332 p &= 0xff00;
333 }
334 // Script starts should be added in ascending order, otherwise we would need to sort them.
335 if(script < UCOL_REORDER_CODE_FIRST) {
336 U_ASSERT(0 <= script && script < USCRIPT_CODE_LIMIT);
337 } else {
338 U_ASSERT(script <= (UCOL_REORDER_CODE_FIRST + 15));
339 script = USCRIPT_CODE_LIMIT + script - UCOL_REORDER_CODE_FIRST;
340 }
341 if(scriptStartsLength != 0 && scriptStarts[scriptStartsLength - 1] == p) {
342 // Two scripts share a range (e.g., Hira & Kana).
343 scriptsIndex[script] = (uint16_t)(scriptStartsLength - 1);
344 } else {
345 U_ASSERT(scriptStartsLength == 0 || scriptStarts[scriptStartsLength - 1] <= p);
346 U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
347 scriptsIndex[script] = (uint16_t)scriptStartsLength;
348 scriptStarts[scriptStartsLength++] = (uint16_t)p;
349 }
350 if(script == USCRIPT_UNKNOWN) {
351 // The last script start is for unassigned code points
352 // (with high implicit primary weights).
353 // Add one more entry with the limit of this range,
354 // which is the start of the trailing-weights range.
355 U_ASSERT(scriptStartsLength < UPRV_LENGTHOF(scriptStarts));
356 scriptStarts[scriptStartsLength++] =
357 (uint16_t)((Collation::FIRST_TRAILING_PRIMARY >> 16) & 0xff00);
358 }
359 }
360
361 void
build(CollationData & data,UErrorCode & errorCode)362 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
363 buildMappings(data, errorCode);
364 data.numericPrimary = numericPrimary;
365 data.compressibleBytes = compressibleBytes;
366
367 int32_t numScripts = USCRIPT_CODE_LIMIT;
368 while(numScripts > 0 && scriptsIndex[numScripts - 1] == 0) { --numScripts; }
369 // Move the 16 special groups (not all used)
370 // down for contiguous storage of the script and special-group indexes.
371 for(int32_t i = 0; i < 16; ++i) {
372 scriptsIndex[numScripts + i] = scriptsIndex[USCRIPT_CODE_LIMIT + i];
373 }
374 data.numScripts = numScripts;
375 data.scriptsIndex = scriptsIndex;
376 data.scriptStarts = scriptStarts;
377 data.scriptStartsLength = scriptStartsLength;
378 buildFastLatinTable(data, errorCode);
379 }
380
381 void
buildRootElementsTable(UVector32 & table,UErrorCode & errorCode)382 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
383 // Limit sentinel for root elements.
384 // This allows us to reduce range checks at runtime.
385 rootElements.addElement(Collation::makeCE(CollationRootElements::PRIMARY_SENTINEL), errorCode);
386 if(U_FAILURE(errorCode)) { return; }
387 uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range.
388 uint32_t prevPrimary = 0; // Start with primary ignorable CEs.
389 UBool needCommonSecTerUnit = false;
390 UBool hasDeltaUnit = false;
391 for(int32_t i = 0; i < rootElements.size(); ++i) {
392 int64_t ce = rootElements.elementAti(i);
393 uint32_t p = (uint32_t)(ce >> 32);
394 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
395 if((p != prevPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE) && needCommonSecTerUnit) {
396 // The last primary had low sec/ter weights but no common sec/ter combination.
397 // The next unit is either a new primary or an above-common sec/ter unit.
398 // Insert a common sec/ter unit so that the builder will reliably
399 // tailor to either before or after a common weight but not across it.
400 table.addElement((int32_t)Collation::COMMON_SEC_AND_TER_CE |
401 CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
402 }
403 if(p != prevPrimary) {
404 U_ASSERT((p & 0xff) == 0);
405 int32_t end;
406 if(p >= nextHanPrimary) {
407 // Add a Han primary weight or range.
408 // We omitted them initially, and omitted all CEs with Han primaries
409 // and common secondary/tertiary weights.
410 U_ASSERT(p > lastHanPrimary || secTer > Collation::COMMON_SEC_AND_TER_CE);
411 if(p == nextHanPrimary) {
412 // One single Han primary with non-common secondary/tertiary weights.
413 table.addElement((int32_t)p, errorCode);
414 if(p < lastHanPrimary) {
415 // Prepare for the next Han range.
416 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
417 } else {
418 // p is the last Han primary.
419 nextHanPrimary = 0xffffffff;
420 }
421 } else {
422 // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
423 table.addElement((int32_t)nextHanPrimary, errorCode);
424 if(nextHanPrimary == lastHanPrimary) {
425 // nextHanPrimary == lastHanPrimary < p
426 // We just wrote the single last Han primary.
427 nextHanPrimary = 0xffffffff;
428 table.addElement((int32_t)p, errorCode);
429 } else if(p < lastHanPrimary) {
430 // nextHanPrimary < p < lastHanPrimary
431 // End the Han range on p, prepare for the next range.
432 table.addElement((int32_t)p | hanStep, errorCode);
433 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, false, hanStep);
434 } else if(p == lastHanPrimary) {
435 // nextHanPrimary < p == lastHanPrimary
436 // End the last Han range on p.
437 table.addElement((int32_t)p | hanStep, errorCode);
438 nextHanPrimary = 0xffffffff;
439 } else {
440 // nextHanPrimary < lastHanPrimary < p
441 // End the last Han range, then write p.
442 table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
443 nextHanPrimary = 0xffffffff;
444 table.addElement((int32_t)p, errorCode);
445 }
446 }
447 } else if(prevPrimary != 0 &&
448 // If there has not been an intervening delta unit,
449 // then we will try to combine the previous primary and
450 // the next several primaries into a range.
451 !hasDeltaUnit &&
452 // Might get a range with more than two primaries if the current CE
453 // has common sec/ter weights.
454 secTer == Collation::COMMON_SEC_AND_TER_CE &&
455 (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
456 // Multiple CEs with only common secondary/tertiary weights were
457 // combined into a primary range.
458 // The range end was written, ending with the primary of rootElements[end].
459 ce = rootElements.elementAti(end);
460 p = (uint32_t)(ce >> 32);
461 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
462 i = end;
463 } else {
464 // Write the primary weight of a normal CE.
465 table.addElement((int32_t)p, errorCode);
466 }
467 prevPrimary = p;
468 needCommonSecTerUnit = false;
469 hasDeltaUnit = false;
470 }
471 if(secTer == Collation::COMMON_SEC_AND_TER_CE && !needCommonSecTerUnit) {
472 // The common secondar/tertiary weights are implied in the primary unit.
473 } else {
474 if(secTer < Collation::COMMON_SEC_AND_TER_CE) {
475 // Remember to not suppress a common sec/ter unit if p!=0.
476 needCommonSecTerUnit = p != 0;
477 } else if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
478 // Real common sec/ter unit, no need to insert an artificial one.
479 needCommonSecTerUnit = false;
480 }
481 // For each new set of secondary/tertiary weights we write a delta unit.
482 table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
483 hasDeltaUnit = true;
484 }
485 }
486 }
487
488 int32_t
writeRootElementsRange(uint32_t prevPrimary,uint32_t p,int32_t i,UVector32 & table,UErrorCode & errorCode)489 CollationBaseDataBuilder::writeRootElementsRange(
490 uint32_t prevPrimary, uint32_t p, int32_t i,
491 UVector32 &table, UErrorCode &errorCode) {
492 if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
493 U_ASSERT(prevPrimary < p);
494 // No ranges of single-byte primaries.
495 if((p & prevPrimary & 0xff0000) == 0) { return 0; }
496 // Lead bytes of compressible primaries must match.
497 UBool isCompressible = isCompressiblePrimary(p);
498 if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
499 (p & 0xff000000) != (prevPrimary & 0xff000000)) {
500 return 0;
501 }
502 // Number of bytes in the primaries.
503 UBool twoBytes;
504 // Number of primaries from prevPrimary to p.
505 int32_t step;
506 if((p & 0xff00) == 0) {
507 // 2-byte primary
508 if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch
509 twoBytes = true;
510 step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
511 } else {
512 // 3-byte primary
513 if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch
514 twoBytes = false;
515 step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
516 }
517 if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
518 // See if there are more than two CEs with primaries increasing by "step"
519 // and with only common secondary/tertiary weights on all but the last one.
520 int32_t end = 0; // Initially 0: No range for just two primaries.
521 for(;;) {
522 prevPrimary = p;
523 // Calculate which primary we expect next.
524 uint32_t nextPrimary; // = p + step
525 if(twoBytes) {
526 nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
527 } else {
528 nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
529 }
530 // Fetch the actual next CE.
531 int64_t ce = rootElements.elementAti(i);
532 p = (uint32_t)(ce >> 32);
533 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
534 // Does this primary increase by "step" from the last one?
535 if(p != nextPrimary ||
536 // Do not cross into a new lead byte if either is compressible.
537 ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
538 (isCompressible || isCompressiblePrimary(p)))) {
539 // The range ends with the previous CE.
540 p = prevPrimary;
541 break;
542 }
543 // Extend the range to include this primary.
544 end = i++;
545 // This primary is the last in the range if it has non-common weights
546 // or if we are at the end of the list.
547 if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
548 }
549 if(end != 0) {
550 table.addElement((int32_t)p | step, errorCode);
551 }
552 return end;
553 }
554
555 U_NAMESPACE_END
556
557 #endif // !UCONFIG_NO_COLLATION
558