1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
7 *
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "putilimp.h" // U_ALIGN_CODE
21 #include "uassert.h"
22
23 U_NAMESPACE_BEGIN
24
25 int32_t
getOptions(const CollationData * data,const CollationSettings & settings,uint16_t * primaries,int32_t capacity)26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
27 uint16_t *primaries, int32_t capacity) {
28 const uint16_t *table = data->fastLatinTable;
29 if(table == NULL) { return -1; }
30 U_ASSERT(capacity == LATIN_LIMIT);
31 if(capacity != LATIN_LIMIT) { return -1; }
32
33 uint32_t miniVarTop;
34 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
35 // No mini primaries are variable, set a variableTop just below the
36 // lowest long mini primary.
37 miniVarTop = MIN_LONG - 1;
38 } else {
39 uint32_t v1 = settings.variableTop >> 24;
40 int32_t headerLength = *table & 0xff;
41 int32_t i = headerLength - 1;
42 if(i <= 0 || v1 > (table[i] & 0x7f)) {
43 return -1; // variableTop >= digits, should not occur
44 }
45 while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
46 // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
47 // Shift right to make it comparable with long mini primaries in bits 15..3.
48 miniVarTop = (table[i] & 0xff80) >> 4;
49 }
50
51 const uint8_t *reorderTable = settings.reorderTable;
52 if(reorderTable != NULL) {
53 const uint16_t *scripts = data->scripts;
54 int32_t length = data->scriptsLength;
55 uint32_t prevLastByte = 0;
56 for(int32_t i = 0; i < length;) {
57 // reordered last byte of the group
58 uint32_t lastByte = reorderTable[scripts[i] & 0xff];
59 if(lastByte < prevLastByte) {
60 // The permutation affects the groups up to Latin.
61 return -1;
62 }
63 if(scripts[i + 2] == USCRIPT_LATIN) { break; }
64 i = i + 2 + scripts[i + 1];
65 prevLastByte = lastByte;
66 }
67 }
68
69 table += (table[0] & 0xff); // skip the header
70 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
71 uint32_t p = table[c];
72 if(p >= MIN_SHORT) {
73 p &= SHORT_PRIMARY_MASK;
74 } else if(p > miniVarTop) {
75 p &= LONG_PRIMARY_MASK;
76 } else {
77 p = 0;
78 }
79 primaries[c] = (uint16_t)p;
80 }
81 if((settings.options & CollationSettings::NUMERIC) != 0) {
82 // Bail out for digits.
83 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
84 }
85
86 // Shift the miniVarTop above other options.
87 return ((int32_t)miniVarTop << 16) | settings.options;
88 }
89
90 int32_t
compareUTF16(const uint16_t * table,const uint16_t * primaries,int32_t options,const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength)91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
92 const UChar *left, int32_t leftLength,
93 const UChar *right, int32_t rightLength) {
94 // This is a modified copy of CollationCompare::compareUpToQuaternary(),
95 // optimized for common Latin text.
96 // Keep them in sync!
97 // Keep compareUTF16() and compareUTF8() in sync very closely!
98
99 U_ASSERT((table[0] >> 8) == VERSION);
100 table += (table[0] & 0xff); // skip the header
101 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions()
102 options &= 0xffff; // needed for CollationSettings::getStrength() to work
103
104 // Check for supported characters, fetch mini CEs, and compare primaries.
105 U_ALIGN_CODE(16);
106 int32_t leftIndex = 0, rightIndex = 0;
107 /**
108 * Single mini CE or a pair.
109 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
110 * If there is only one, then it is in the lower bits, and the upper bits are 0.
111 */
112 uint32_t leftPair = 0, rightPair = 0;
113 for(;;) {
114 // We fetch CEs until we get a non-ignorable primary or reach the end.
115 while(leftPair == 0) {
116 if(leftIndex == leftLength) {
117 leftPair = EOS;
118 break;
119 }
120 UChar32 c = left[leftIndex++];
121 if(c <= LATIN_MAX) {
122 leftPair = primaries[c];
123 if(leftPair != 0) { break; }
124 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
125 return BAIL_OUT_RESULT;
126 }
127 leftPair = table[c];
128 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
129 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
130 } else {
131 leftPair = lookup(table, c);
132 }
133 if(leftPair >= MIN_SHORT) {
134 leftPair &= SHORT_PRIMARY_MASK;
135 break;
136 } else if(leftPair > variableTop) {
137 leftPair &= LONG_PRIMARY_MASK;
138 break;
139 } else {
140 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
141 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
142 leftPair = getPrimaries(variableTop, leftPair);
143 }
144 }
145
146 while(rightPair == 0) {
147 if(rightIndex == rightLength) {
148 rightPair = EOS;
149 break;
150 }
151 UChar32 c = right[rightIndex++];
152 if(c <= LATIN_MAX) {
153 rightPair = primaries[c];
154 if(rightPair != 0) { break; }
155 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
156 return BAIL_OUT_RESULT;
157 }
158 rightPair = table[c];
159 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
160 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
161 } else {
162 rightPair = lookup(table, c);
163 }
164 if(rightPair >= MIN_SHORT) {
165 rightPair &= SHORT_PRIMARY_MASK;
166 break;
167 } else if(rightPair > variableTop) {
168 rightPair &= LONG_PRIMARY_MASK;
169 break;
170 } else {
171 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
172 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
173 rightPair = getPrimaries(variableTop, rightPair);
174 }
175 }
176
177 if(leftPair == rightPair) {
178 if(leftPair == EOS) { break; }
179 leftPair = rightPair = 0;
180 continue;
181 }
182 uint32_t leftPrimary = leftPair & 0xffff;
183 uint32_t rightPrimary = rightPair & 0xffff;
184 if(leftPrimary != rightPrimary) {
185 // Return the primary difference.
186 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
187 }
188 if(leftPair == EOS) { break; }
189 leftPair >>= 16;
190 rightPair >>= 16;
191 }
192 // In the following, we need to re-fetch each character because we did not buffer the CEs,
193 // but we know that the string is well-formed and
194 // only contains supported characters and mappings.
195
196 // We might skip the secondary level but continue with the case level
197 // which is turned on separately.
198 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
199 leftIndex = rightIndex = 0;
200 leftPair = rightPair = 0;
201 for(;;) {
202 while(leftPair == 0) {
203 if(leftIndex == leftLength) {
204 leftPair = EOS;
205 break;
206 }
207 UChar32 c = left[leftIndex++];
208 if(c <= LATIN_MAX) {
209 leftPair = table[c];
210 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
211 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
212 } else {
213 leftPair = lookup(table, c);
214 }
215 if(leftPair >= MIN_SHORT) {
216 leftPair = getSecondariesFromOneShortCE(leftPair);
217 break;
218 } else if(leftPair > variableTop) {
219 leftPair = COMMON_SEC_PLUS_OFFSET;
220 break;
221 } else {
222 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
223 leftPair = getSecondaries(variableTop, leftPair);
224 }
225 }
226
227 while(rightPair == 0) {
228 if(rightIndex == rightLength) {
229 rightPair = EOS;
230 break;
231 }
232 UChar32 c = right[rightIndex++];
233 if(c <= LATIN_MAX) {
234 rightPair = table[c];
235 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
236 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
237 } else {
238 rightPair = lookup(table, c);
239 }
240 if(rightPair >= MIN_SHORT) {
241 rightPair = getSecondariesFromOneShortCE(rightPair);
242 break;
243 } else if(rightPair > variableTop) {
244 rightPair = COMMON_SEC_PLUS_OFFSET;
245 break;
246 } else {
247 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
248 rightPair = getSecondaries(variableTop, rightPair);
249 }
250 }
251
252 if(leftPair == rightPair) {
253 if(leftPair == EOS) { break; }
254 leftPair = rightPair = 0;
255 continue;
256 }
257 uint32_t leftSecondary = leftPair & 0xffff;
258 uint32_t rightSecondary = rightPair & 0xffff;
259 if(leftSecondary != rightSecondary) {
260 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
261 // Full support for backwards secondary requires backwards contraction matching
262 // and moving backwards between merge separators.
263 return BAIL_OUT_RESULT;
264 }
265 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
266 }
267 if(leftPair == EOS) { break; }
268 leftPair >>= 16;
269 rightPair >>= 16;
270 }
271 }
272
273 if((options & CollationSettings::CASE_LEVEL) != 0) {
274 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
275 leftIndex = rightIndex = 0;
276 leftPair = rightPair = 0;
277 for(;;) {
278 while(leftPair == 0) {
279 if(leftIndex == leftLength) {
280 leftPair = EOS;
281 break;
282 }
283 UChar32 c = left[leftIndex++];
284 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
285 if(leftPair < MIN_LONG) {
286 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
287 }
288 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
289 }
290
291 while(rightPair == 0) {
292 if(rightIndex == rightLength) {
293 rightPair = EOS;
294 break;
295 }
296 UChar32 c = right[rightIndex++];
297 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
298 if(rightPair < MIN_LONG) {
299 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
300 }
301 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
302 }
303
304 if(leftPair == rightPair) {
305 if(leftPair == EOS) { break; }
306 leftPair = rightPair = 0;
307 continue;
308 }
309 uint32_t leftCase = leftPair & 0xffff;
310 uint32_t rightCase = rightPair & 0xffff;
311 if(leftCase != rightCase) {
312 if((options & CollationSettings::UPPER_FIRST) == 0) {
313 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
314 } else {
315 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
316 }
317 }
318 if(leftPair == EOS) { break; }
319 leftPair >>= 16;
320 rightPair >>= 16;
321 }
322 }
323 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
324
325 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
326 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
327
328 leftIndex = rightIndex = 0;
329 leftPair = rightPair = 0;
330 for(;;) {
331 while(leftPair == 0) {
332 if(leftIndex == leftLength) {
333 leftPair = EOS;
334 break;
335 }
336 UChar32 c = left[leftIndex++];
337 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
338 if(leftPair < MIN_LONG) {
339 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
340 }
341 leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
342 }
343
344 while(rightPair == 0) {
345 if(rightIndex == rightLength) {
346 rightPair = EOS;
347 break;
348 }
349 UChar32 c = right[rightIndex++];
350 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
351 if(rightPair < MIN_LONG) {
352 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
353 }
354 rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
355 }
356
357 if(leftPair == rightPair) {
358 if(leftPair == EOS) { break; }
359 leftPair = rightPair = 0;
360 continue;
361 }
362 uint32_t leftTertiary = leftPair & 0xffff;
363 uint32_t rightTertiary = rightPair & 0xffff;
364 if(leftTertiary != rightTertiary) {
365 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
366 // Pass through EOS and MERGE_WEIGHT
367 // and keep real tertiary weights larger than the MERGE_WEIGHT.
368 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
369 if(leftTertiary > MERGE_WEIGHT) {
370 leftTertiary ^= CASE_MASK;
371 }
372 if(rightTertiary > MERGE_WEIGHT) {
373 rightTertiary ^= CASE_MASK;
374 }
375 }
376 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
377 }
378 if(leftPair == EOS) { break; }
379 leftPair >>= 16;
380 rightPair >>= 16;
381 }
382 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
383
384 leftIndex = rightIndex = 0;
385 leftPair = rightPair = 0;
386 for(;;) {
387 while(leftPair == 0) {
388 if(leftIndex == leftLength) {
389 leftPair = EOS;
390 break;
391 }
392 UChar32 c = left[leftIndex++];
393 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
394 if(leftPair < MIN_LONG) {
395 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
396 }
397 leftPair = getQuaternaries(variableTop, leftPair);
398 }
399
400 while(rightPair == 0) {
401 if(rightIndex == rightLength) {
402 rightPair = EOS;
403 break;
404 }
405 UChar32 c = right[rightIndex++];
406 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
407 if(rightPair < MIN_LONG) {
408 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
409 }
410 rightPair = getQuaternaries(variableTop, rightPair);
411 }
412
413 if(leftPair == rightPair) {
414 if(leftPair == EOS) { break; }
415 leftPair = rightPair = 0;
416 continue;
417 }
418 uint32_t leftQuaternary = leftPair & 0xffff;
419 uint32_t rightQuaternary = rightPair & 0xffff;
420 if(leftQuaternary != rightQuaternary) {
421 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
422 }
423 if(leftPair == EOS) { break; }
424 leftPair >>= 16;
425 rightPair >>= 16;
426 }
427 return UCOL_EQUAL;
428 }
429
430 int32_t
compareUTF8(const uint16_t * table,const uint16_t * primaries,int32_t options,const uint8_t * left,int32_t leftLength,const uint8_t * right,int32_t rightLength)431 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
432 const uint8_t *left, int32_t leftLength,
433 const uint8_t *right, int32_t rightLength) {
434 // Keep compareUTF16() and compareUTF8() in sync very closely!
435
436 U_ASSERT((table[0] >> 8) == VERSION);
437 table += (table[0] & 0xff); // skip the header
438 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions()
439 options &= 0xffff; // needed for CollationSettings::getStrength() to work
440
441 // Check for supported characters, fetch mini CEs, and compare primaries.
442 U_ALIGN_CODE(16);
443 int32_t leftIndex = 0, rightIndex = 0;
444 /**
445 * Single mini CE or a pair.
446 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
447 * If there is only one, then it is in the lower bits, and the upper bits are 0.
448 */
449 uint32_t leftPair = 0, rightPair = 0;
450 // Note: There is no need to assemble the code point.
451 // We only need to look up the table entry for the character,
452 // and nextPair() looks for whether c==0.
453 for(;;) {
454 // We fetch CEs until we get a non-ignorable primary or reach the end.
455 while(leftPair == 0) {
456 if(leftIndex == leftLength) {
457 leftPair = EOS;
458 break;
459 }
460 UChar32 c = left[leftIndex++];
461 uint8_t t;
462 if(c <= 0x7f) {
463 leftPair = primaries[c];
464 if(leftPair != 0) { break; }
465 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
466 return BAIL_OUT_RESULT;
467 }
468 leftPair = table[c];
469 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
470 0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
471 ++leftIndex;
472 c = ((c - 0xc2) << 6) + t;
473 leftPair = primaries[c];
474 if(leftPair != 0) { break; }
475 leftPair = table[c];
476 } else {
477 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
478 }
479 if(leftPair >= MIN_SHORT) {
480 leftPair &= SHORT_PRIMARY_MASK;
481 break;
482 } else if(leftPair > variableTop) {
483 leftPair &= LONG_PRIMARY_MASK;
484 break;
485 } else {
486 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
487 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
488 leftPair = getPrimaries(variableTop, leftPair);
489 }
490 }
491
492 while(rightPair == 0) {
493 if(rightIndex == rightLength) {
494 rightPair = EOS;
495 break;
496 }
497 UChar32 c = right[rightIndex++];
498 uint8_t t;
499 if(c <= 0x7f) {
500 rightPair = primaries[c];
501 if(rightPair != 0) { break; }
502 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
503 return BAIL_OUT_RESULT;
504 }
505 rightPair = table[c];
506 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
507 0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
508 ++rightIndex;
509 c = ((c - 0xc2) << 6) + t;
510 rightPair = primaries[c];
511 if(rightPair != 0) { break; }
512 rightPair = table[c];
513 } else {
514 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
515 }
516 if(rightPair >= MIN_SHORT) {
517 rightPair &= SHORT_PRIMARY_MASK;
518 break;
519 } else if(rightPair > variableTop) {
520 rightPair &= LONG_PRIMARY_MASK;
521 break;
522 } else {
523 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
524 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
525 rightPair = getPrimaries(variableTop, rightPair);
526 }
527 }
528
529 if(leftPair == rightPair) {
530 if(leftPair == EOS) { break; }
531 leftPair = rightPair = 0;
532 continue;
533 }
534 uint32_t leftPrimary = leftPair & 0xffff;
535 uint32_t rightPrimary = rightPair & 0xffff;
536 if(leftPrimary != rightPrimary) {
537 // Return the primary difference.
538 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
539 }
540 if(leftPair == EOS) { break; }
541 leftPair >>= 16;
542 rightPair >>= 16;
543 }
544 // In the following, we need to re-fetch each character because we did not buffer the CEs,
545 // but we know that the string is well-formed and
546 // only contains supported characters and mappings.
547
548 // We might skip the secondary level but continue with the case level
549 // which is turned on separately.
550 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
551 leftIndex = rightIndex = 0;
552 leftPair = rightPair = 0;
553 for(;;) {
554 while(leftPair == 0) {
555 if(leftIndex == leftLength) {
556 leftPair = EOS;
557 break;
558 }
559 UChar32 c = left[leftIndex++];
560 if(c <= 0x7f) {
561 leftPair = table[c];
562 } else if(c <= LATIN_MAX_UTF8_LEAD) {
563 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
564 } else {
565 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
566 }
567 if(leftPair >= MIN_SHORT) {
568 leftPair = getSecondariesFromOneShortCE(leftPair);
569 break;
570 } else if(leftPair > variableTop) {
571 leftPair = COMMON_SEC_PLUS_OFFSET;
572 break;
573 } else {
574 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
575 leftPair = getSecondaries(variableTop, leftPair);
576 }
577 }
578
579 while(rightPair == 0) {
580 if(rightIndex == rightLength) {
581 rightPair = EOS;
582 break;
583 }
584 UChar32 c = right[rightIndex++];
585 if(c <= 0x7f) {
586 rightPair = table[c];
587 } else if(c <= LATIN_MAX_UTF8_LEAD) {
588 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
589 } else {
590 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
591 }
592 if(rightPair >= MIN_SHORT) {
593 rightPair = getSecondariesFromOneShortCE(rightPair);
594 break;
595 } else if(rightPair > variableTop) {
596 rightPair = COMMON_SEC_PLUS_OFFSET;
597 break;
598 } else {
599 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
600 rightPair = getSecondaries(variableTop, rightPair);
601 }
602 }
603
604 if(leftPair == rightPair) {
605 if(leftPair == EOS) { break; }
606 leftPair = rightPair = 0;
607 continue;
608 }
609 uint32_t leftSecondary = leftPair & 0xffff;
610 uint32_t rightSecondary = rightPair & 0xffff;
611 if(leftSecondary != rightSecondary) {
612 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
613 // Full support for backwards secondary requires backwards contraction matching
614 // and moving backwards between merge separators.
615 return BAIL_OUT_RESULT;
616 }
617 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
618 }
619 if(leftPair == EOS) { break; }
620 leftPair >>= 16;
621 rightPair >>= 16;
622 }
623 }
624
625 if((options & CollationSettings::CASE_LEVEL) != 0) {
626 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
627 leftIndex = rightIndex = 0;
628 leftPair = rightPair = 0;
629 for(;;) {
630 while(leftPair == 0) {
631 if(leftIndex == leftLength) {
632 leftPair = EOS;
633 break;
634 }
635 UChar32 c = left[leftIndex++];
636 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
637 if(leftPair < MIN_LONG) {
638 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
639 }
640 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
641 }
642
643 while(rightPair == 0) {
644 if(rightIndex == rightLength) {
645 rightPair = EOS;
646 break;
647 }
648 UChar32 c = right[rightIndex++];
649 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
650 if(rightPair < MIN_LONG) {
651 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
652 }
653 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
654 }
655
656 if(leftPair == rightPair) {
657 if(leftPair == EOS) { break; }
658 leftPair = rightPair = 0;
659 continue;
660 }
661 uint32_t leftCase = leftPair & 0xffff;
662 uint32_t rightCase = rightPair & 0xffff;
663 if(leftCase != rightCase) {
664 if((options & CollationSettings::UPPER_FIRST) == 0) {
665 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
666 } else {
667 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
668 }
669 }
670 if(leftPair == EOS) { break; }
671 leftPair >>= 16;
672 rightPair >>= 16;
673 }
674 }
675 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
676
677 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
678 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
679
680 leftIndex = rightIndex = 0;
681 leftPair = rightPair = 0;
682 for(;;) {
683 while(leftPair == 0) {
684 if(leftIndex == leftLength) {
685 leftPair = EOS;
686 break;
687 }
688 UChar32 c = left[leftIndex++];
689 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
690 if(leftPair < MIN_LONG) {
691 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
692 }
693 leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
694 }
695
696 while(rightPair == 0) {
697 if(rightIndex == rightLength) {
698 rightPair = EOS;
699 break;
700 }
701 UChar32 c = right[rightIndex++];
702 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
703 if(rightPair < MIN_LONG) {
704 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
705 }
706 rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
707 }
708
709 if(leftPair == rightPair) {
710 if(leftPair == EOS) { break; }
711 leftPair = rightPair = 0;
712 continue;
713 }
714 uint32_t leftTertiary = leftPair & 0xffff;
715 uint32_t rightTertiary = rightPair & 0xffff;
716 if(leftTertiary != rightTertiary) {
717 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
718 // Pass through EOS and MERGE_WEIGHT
719 // and keep real tertiary weights larger than the MERGE_WEIGHT.
720 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
721 if(leftTertiary > MERGE_WEIGHT) {
722 leftTertiary ^= CASE_MASK;
723 }
724 if(rightTertiary > MERGE_WEIGHT) {
725 rightTertiary ^= CASE_MASK;
726 }
727 }
728 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
729 }
730 if(leftPair == EOS) { break; }
731 leftPair >>= 16;
732 rightPair >>= 16;
733 }
734 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
735
736 leftIndex = rightIndex = 0;
737 leftPair = rightPair = 0;
738 for(;;) {
739 while(leftPair == 0) {
740 if(leftIndex == leftLength) {
741 leftPair = EOS;
742 break;
743 }
744 UChar32 c = left[leftIndex++];
745 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
746 if(leftPair < MIN_LONG) {
747 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
748 }
749 leftPair = getQuaternaries(variableTop, leftPair);
750 }
751
752 while(rightPair == 0) {
753 if(rightIndex == rightLength) {
754 rightPair = EOS;
755 break;
756 }
757 UChar32 c = right[rightIndex++];
758 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
759 if(rightPair < MIN_LONG) {
760 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
761 }
762 rightPair = getQuaternaries(variableTop, rightPair);
763 }
764
765 if(leftPair == rightPair) {
766 if(leftPair == EOS) { break; }
767 leftPair = rightPair = 0;
768 continue;
769 }
770 uint32_t leftQuaternary = leftPair & 0xffff;
771 uint32_t rightQuaternary = rightPair & 0xffff;
772 if(leftQuaternary != rightQuaternary) {
773 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
774 }
775 if(leftPair == EOS) { break; }
776 leftPair >>= 16;
777 rightPair >>= 16;
778 }
779 return UCOL_EQUAL;
780 }
781
782 uint32_t
lookup(const uint16_t * table,UChar32 c)783 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
784 U_ASSERT(c > LATIN_MAX);
785 if(PUNCT_START <= c && c < PUNCT_LIMIT) {
786 return table[c - PUNCT_START + LATIN_LIMIT];
787 } else if(c == 0xfffe) {
788 return MERGE_WEIGHT;
789 } else if(c == 0xffff) {
790 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
791 } else {
792 return BAIL_OUT;
793 }
794 }
795
796 uint32_t
lookupUTF8(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex,int32_t sLength)797 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
798 const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
799 // The caller handled ASCII and valid/supported Latin.
800 U_ASSERT(c > 0x7f);
801 int32_t i2 = sIndex + 1;
802 if(i2 < sLength || sLength < 0) {
803 uint8_t t1 = s8[sIndex];
804 uint8_t t2 = s8[i2];
805 sIndex += 2;
806 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
807 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
808 } else if(c == 0xef && t1 == 0xbf) {
809 if(t2 == 0xbe) {
810 return MERGE_WEIGHT; // U+FFFE
811 } else if(t2 == 0xbf) {
812 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
813 }
814 }
815 }
816 return BAIL_OUT;
817 }
818
819 uint32_t
lookupUTF8Unsafe(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex)820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
821 const uint8_t *s8, int32_t &sIndex) {
822 // The caller handled ASCII.
823 // The string is well-formed and contains only supported characters.
824 U_ASSERT(c > 0x7f);
825 if(c <= LATIN_MAX_UTF8_LEAD) {
826 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F
827 }
828 uint8_t t2 = s8[sIndex + 1];
829 sIndex += 2;
830 if(c == 0xe2) {
831 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF
832 } else if(t2 == 0xbe) {
833 return MERGE_WEIGHT; // U+FFFE
834 } else {
835 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF
836 }
837 }
838
839 uint32_t
nextPair(const uint16_t * table,UChar32 c,uint32_t ce,const UChar * s16,const uint8_t * s8,int32_t & sIndex,int32_t & sLength)840 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
841 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
842 if(ce >= MIN_LONG || ce < CONTRACTION) {
843 return ce; // simple or special mini CE
844 } else if(ce >= EXPANSION) {
845 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
846 return ((uint32_t)table[index + 1] << 16) | table[index];
847 } else /* ce >= CONTRACTION */ {
848 if(c == 0 && sLength < 0) {
849 sLength = sIndex - 1;
850 return EOS;
851 }
852 // Contraction list: Default mapping followed by
853 // 0 or more single-character contraction suffix mappings.
854 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
855 if(sIndex != sLength) {
856 // Read the next character.
857 int32_t c2;
858 int32_t nextIndex = sIndex;
859 if(s16 != NULL) {
860 c2 = s16[nextIndex++];
861 if(c2 > LATIN_MAX) {
862 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
863 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF
864 } else if(c2 == 0xfffe || c2 == 0xffff) {
865 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
866 } else {
867 return BAIL_OUT;
868 }
869 }
870 } else {
871 c2 = s8[nextIndex++];
872 if(c2 > 0x7f) {
873 uint8_t t;
874 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
875 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
876 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F
877 ++nextIndex;
878 } else {
879 int32_t i2 = nextIndex + 1;
880 if(i2 < sLength || sLength < 0) {
881 if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
882 0x80 <= (t = s8[i2]) && t <= 0xbf) {
883 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF
884 } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
885 ((t = s8[i2]) == 0xbe || t == 0xbf)) {
886 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions.
887 } else {
888 return BAIL_OUT;
889 }
890 } else {
891 return BAIL_OUT;
892 }
893 nextIndex += 2;
894 }
895 }
896 }
897 if(c2 == 0 && sLength < 0) {
898 sLength = sIndex;
899 c2 = -1;
900 }
901 // Look for the next character in the contraction suffix list,
902 // which is in ascending order of single suffix characters.
903 int32_t i = index;
904 int32_t head = table[i]; // first skip the default mapping
905 int32_t x;
906 do {
907 i += head >> CONTR_LENGTH_SHIFT;
908 head = table[i];
909 x = head & CONTR_CHAR_MASK;
910 } while(x < c2);
911 if(x == c2) {
912 index = i;
913 sIndex = nextIndex;
914 }
915 }
916 // Return the CE or CEs for the default or contraction mapping.
917 int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
918 if(length == 1) {
919 return BAIL_OUT;
920 }
921 ce = table[index + 1];
922 if(length == 2) {
923 return ce;
924 } else {
925 return ((uint32_t)table[index + 2] << 16) | ce;
926 }
927 }
928 }
929
930 uint32_t
getSecondaries(uint32_t variableTop,uint32_t pair)931 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
932 if(pair <= 0xffff) {
933 // one mini CE
934 if(pair >= MIN_SHORT) {
935 pair = getSecondariesFromOneShortCE(pair);
936 } else if(pair > variableTop) {
937 pair = COMMON_SEC_PLUS_OFFSET;
938 } else if(pair >= MIN_LONG) {
939 pair = 0; // variable
940 }
941 // else special mini CE
942 } else {
943 uint32_t ce = pair & 0xffff;
944 if(ce >= MIN_SHORT) {
945 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
946 } else if(ce > variableTop) {
947 pair = TWO_COMMON_SEC_PLUS_OFFSET;
948 } else {
949 U_ASSERT(ce >= MIN_LONG);
950 pair = 0; // variable
951 }
952 }
953 return pair;
954 }
955
956 uint32_t
getCases(uint32_t variableTop,UBool strengthIsPrimary,uint32_t pair)957 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
958 // Primary+caseLevel: Ignore case level weights of primary ignorables.
959 // Otherwise: Ignore case level weights of secondary ignorables.
960 // For details see the comments in the CollationCompare class.
961 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
962 if(pair <= 0xffff) {
963 // one mini CE
964 if(pair >= MIN_SHORT) {
965 // A high secondary weight means we really have two CEs,
966 // a primary CE and a secondary CE.
967 uint32_t ce = pair;
968 pair &= CASE_MASK; // explicit weight of primary CE
969 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
970 pair |= LOWER_CASE << 16; // implied weight of secondary CE
971 }
972 } else if(pair > variableTop) {
973 pair = LOWER_CASE;
974 } else if(pair >= MIN_LONG) {
975 pair = 0; // variable
976 }
977 // else special mini CE
978 } else {
979 // two mini CEs, same primary groups, neither expands like above
980 uint32_t ce = pair & 0xffff;
981 if(ce >= MIN_SHORT) {
982 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
983 pair &= CASE_MASK;
984 } else {
985 pair &= TWO_CASES_MASK;
986 }
987 } else if(ce > variableTop) {
988 pair = TWO_LOWER_CASES;
989 } else {
990 U_ASSERT(ce >= MIN_LONG);
991 pair = 0; // variable
992 }
993 }
994 return pair;
995 }
996
997 uint32_t
getTertiaries(uint32_t variableTop,UBool withCaseBits,uint32_t pair)998 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
999 if(pair <= 0xffff) {
1000 // one mini CE
1001 if(pair >= MIN_SHORT) {
1002 // A high secondary weight means we really have two CEs,
1003 // a primary CE and a secondary CE.
1004 uint32_t ce = pair;
1005 if(withCaseBits) {
1006 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1007 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1008 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1009 }
1010 } else {
1011 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1012 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1013 pair |= COMMON_TER_PLUS_OFFSET << 16;
1014 }
1015 }
1016 } else if(pair > variableTop) {
1017 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1018 if(withCaseBits) {
1019 pair |= LOWER_CASE;
1020 }
1021 } else if(pair >= MIN_LONG) {
1022 pair = 0; // variable
1023 }
1024 // else special mini CE
1025 } else {
1026 // two mini CEs, same primary groups, neither expands like above
1027 uint32_t ce = pair & 0xffff;
1028 if(ce >= MIN_SHORT) {
1029 if(withCaseBits) {
1030 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1031 } else {
1032 pair &= TWO_TERTIARIES_MASK;
1033 }
1034 pair += TWO_TER_OFFSETS;
1035 } else if(ce > variableTop) {
1036 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1037 if(withCaseBits) {
1038 pair |= TWO_LOWER_CASES;
1039 }
1040 } else {
1041 U_ASSERT(ce >= MIN_LONG);
1042 pair = 0; // variable
1043 }
1044 }
1045 return pair;
1046 }
1047
1048 uint32_t
getQuaternaries(uint32_t variableTop,uint32_t pair)1049 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1050 // Return the primary weight of a variable CE,
1051 // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1052 if(pair <= 0xffff) {
1053 // one mini CE
1054 if(pair >= MIN_SHORT) {
1055 // A high secondary weight means we really have two CEs,
1056 // a primary CE and a secondary CE.
1057 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1058 pair = TWO_SHORT_PRIMARIES_MASK;
1059 } else {
1060 pair = SHORT_PRIMARY_MASK;
1061 }
1062 } else if(pair > variableTop) {
1063 pair = SHORT_PRIMARY_MASK;
1064 } else if(pair >= MIN_LONG) {
1065 pair &= LONG_PRIMARY_MASK; // variable
1066 }
1067 // else special mini CE
1068 } else {
1069 // two mini CEs, same primary groups, neither expands like above
1070 uint32_t ce = pair & 0xffff;
1071 if(ce > variableTop) {
1072 pair = TWO_SHORT_PRIMARIES_MASK;
1073 } else {
1074 U_ASSERT(ce >= MIN_LONG);
1075 pair &= TWO_LONG_PRIMARIES_MASK; // variable
1076 }
1077 }
1078 return pair;
1079 }
1080
1081 U_NAMESPACE_END
1082
1083 #endif // !UCONFIG_NO_COLLATION
1084