• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
7 *
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "putilimp.h"  // U_ALIGN_CODE
21 #include "uassert.h"
22 
23 U_NAMESPACE_BEGIN
24 
25 int32_t
getOptions(const CollationData * data,const CollationSettings & settings,uint16_t * primaries,int32_t capacity)26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
27                                uint16_t *primaries, int32_t capacity) {
28     const uint16_t *table = data->fastLatinTable;
29     if(table == NULL) { return -1; }
30     U_ASSERT(capacity == LATIN_LIMIT);
31     if(capacity != LATIN_LIMIT) { return -1; }
32 
33     uint32_t miniVarTop;
34     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
35         // No mini primaries are variable, set a variableTop just below the
36         // lowest long mini primary.
37         miniVarTop = MIN_LONG - 1;
38     } else {
39         uint32_t v1 = settings.variableTop >> 24;
40         int32_t headerLength = *table & 0xff;
41         int32_t i = headerLength - 1;
42         if(i <= 0 || v1 > (table[i] & 0x7f)) {
43             return -1;  // variableTop >= digits, should not occur
44         }
45         while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
46         // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
47         // Shift right to make it comparable with long mini primaries in bits 15..3.
48         miniVarTop = (table[i] & 0xff80) >> 4;
49     }
50 
51     const uint8_t *reorderTable = settings.reorderTable;
52     if(reorderTable != NULL) {
53         const uint16_t *scripts = data->scripts;
54         int32_t length = data->scriptsLength;
55         uint32_t prevLastByte = 0;
56         for(int32_t i = 0; i < length;) {
57             // reordered last byte of the group
58             uint32_t lastByte = reorderTable[scripts[i] & 0xff];
59             if(lastByte < prevLastByte) {
60                 // The permutation affects the groups up to Latin.
61                 return -1;
62             }
63             if(scripts[i + 2] == USCRIPT_LATIN) { break; }
64             i = i + 2 + scripts[i + 1];
65             prevLastByte = lastByte;
66         }
67     }
68 
69     table += (table[0] & 0xff);  // skip the header
70     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
71         uint32_t p = table[c];
72         if(p >= MIN_SHORT) {
73             p &= SHORT_PRIMARY_MASK;
74         } else if(p > miniVarTop) {
75             p &= LONG_PRIMARY_MASK;
76         } else {
77             p = 0;
78         }
79         primaries[c] = (uint16_t)p;
80     }
81     if((settings.options & CollationSettings::NUMERIC) != 0) {
82         // Bail out for digits.
83         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
84     }
85 
86     // Shift the miniVarTop above other options.
87     return ((int32_t)miniVarTop << 16) | settings.options;
88 }
89 
90 int32_t
compareUTF16(const uint16_t * table,const uint16_t * primaries,int32_t options,const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength)91 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
92                                  const UChar *left, int32_t leftLength,
93                                  const UChar *right, int32_t rightLength) {
94     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
95     // optimized for common Latin text.
96     // Keep them in sync!
97     // Keep compareUTF16() and compareUTF8() in sync very closely!
98 
99     U_ASSERT((table[0] >> 8) == VERSION);
100     table += (table[0] & 0xff);  // skip the header
101     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
102     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
103 
104     // Check for supported characters, fetch mini CEs, and compare primaries.
105     U_ALIGN_CODE(16);
106     int32_t leftIndex = 0, rightIndex = 0;
107     /**
108      * Single mini CE or a pair.
109      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
110      * If there is only one, then it is in the lower bits, and the upper bits are 0.
111      */
112     uint32_t leftPair = 0, rightPair = 0;
113     for(;;) {
114         // We fetch CEs until we get a non-ignorable primary or reach the end.
115         while(leftPair == 0) {
116             if(leftIndex == leftLength) {
117                 leftPair = EOS;
118                 break;
119             }
120             UChar32 c = left[leftIndex++];
121             if(c <= LATIN_MAX) {
122                 leftPair = primaries[c];
123                 if(leftPair != 0) { break; }
124                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
125                     return BAIL_OUT_RESULT;
126                 }
127                 leftPair = table[c];
128             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
129                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
130             } else {
131                 leftPair = lookup(table, c);
132             }
133             if(leftPair >= MIN_SHORT) {
134                 leftPair &= SHORT_PRIMARY_MASK;
135                 break;
136             } else if(leftPair > variableTop) {
137                 leftPair &= LONG_PRIMARY_MASK;
138                 break;
139             } else {
140                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
141                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
142                 leftPair = getPrimaries(variableTop, leftPair);
143             }
144         }
145 
146         while(rightPair == 0) {
147             if(rightIndex == rightLength) {
148                 rightPair = EOS;
149                 break;
150             }
151             UChar32 c = right[rightIndex++];
152             if(c <= LATIN_MAX) {
153                 rightPair = primaries[c];
154                 if(rightPair != 0) { break; }
155                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
156                     return BAIL_OUT_RESULT;
157                 }
158                 rightPair = table[c];
159             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
160                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
161             } else {
162                 rightPair = lookup(table, c);
163             }
164             if(rightPair >= MIN_SHORT) {
165                 rightPair &= SHORT_PRIMARY_MASK;
166                 break;
167             } else if(rightPair > variableTop) {
168                 rightPair &= LONG_PRIMARY_MASK;
169                 break;
170             } else {
171                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
172                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
173                 rightPair = getPrimaries(variableTop, rightPair);
174             }
175         }
176 
177         if(leftPair == rightPair) {
178             if(leftPair == EOS) { break; }
179             leftPair = rightPair = 0;
180             continue;
181         }
182         uint32_t leftPrimary = leftPair & 0xffff;
183         uint32_t rightPrimary = rightPair & 0xffff;
184         if(leftPrimary != rightPrimary) {
185             // Return the primary difference.
186             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
187         }
188         if(leftPair == EOS) { break; }
189         leftPair >>= 16;
190         rightPair >>= 16;
191     }
192     // In the following, we need to re-fetch each character because we did not buffer the CEs,
193     // but we know that the string is well-formed and
194     // only contains supported characters and mappings.
195 
196     // We might skip the secondary level but continue with the case level
197     // which is turned on separately.
198     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
199         leftIndex = rightIndex = 0;
200         leftPair = rightPair = 0;
201         for(;;) {
202             while(leftPair == 0) {
203                 if(leftIndex == leftLength) {
204                     leftPair = EOS;
205                     break;
206                 }
207                 UChar32 c = left[leftIndex++];
208                 if(c <= LATIN_MAX) {
209                     leftPair = table[c];
210                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
211                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
212                 } else {
213                     leftPair = lookup(table, c);
214                 }
215                 if(leftPair >= MIN_SHORT) {
216                     leftPair = getSecondariesFromOneShortCE(leftPair);
217                     break;
218                 } else if(leftPair > variableTop) {
219                     leftPair = COMMON_SEC_PLUS_OFFSET;
220                     break;
221                 } else {
222                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
223                     leftPair = getSecondaries(variableTop, leftPair);
224                 }
225             }
226 
227             while(rightPair == 0) {
228                 if(rightIndex == rightLength) {
229                     rightPair = EOS;
230                     break;
231                 }
232                 UChar32 c = right[rightIndex++];
233                 if(c <= LATIN_MAX) {
234                     rightPair = table[c];
235                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
236                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
237                 } else {
238                     rightPair = lookup(table, c);
239                 }
240                 if(rightPair >= MIN_SHORT) {
241                     rightPair = getSecondariesFromOneShortCE(rightPair);
242                     break;
243                 } else if(rightPair > variableTop) {
244                     rightPair = COMMON_SEC_PLUS_OFFSET;
245                     break;
246                 } else {
247                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
248                     rightPair = getSecondaries(variableTop, rightPair);
249                 }
250             }
251 
252             if(leftPair == rightPair) {
253                 if(leftPair == EOS) { break; }
254                 leftPair = rightPair = 0;
255                 continue;
256             }
257             uint32_t leftSecondary = leftPair & 0xffff;
258             uint32_t rightSecondary = rightPair & 0xffff;
259             if(leftSecondary != rightSecondary) {
260                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
261                     // Full support for backwards secondary requires backwards contraction matching
262                     // and moving backwards between merge separators.
263                     return BAIL_OUT_RESULT;
264                 }
265                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
266             }
267             if(leftPair == EOS) { break; }
268             leftPair >>= 16;
269             rightPair >>= 16;
270         }
271     }
272 
273     if((options & CollationSettings::CASE_LEVEL) != 0) {
274         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
275         leftIndex = rightIndex = 0;
276         leftPair = rightPair = 0;
277         for(;;) {
278             while(leftPair == 0) {
279                 if(leftIndex == leftLength) {
280                     leftPair = EOS;
281                     break;
282                 }
283                 UChar32 c = left[leftIndex++];
284                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
285                 if(leftPair < MIN_LONG) {
286                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
287                 }
288                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
289             }
290 
291             while(rightPair == 0) {
292                 if(rightIndex == rightLength) {
293                     rightPair = EOS;
294                     break;
295                 }
296                 UChar32 c = right[rightIndex++];
297                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
298                 if(rightPair < MIN_LONG) {
299                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
300                 }
301                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
302             }
303 
304             if(leftPair == rightPair) {
305                 if(leftPair == EOS) { break; }
306                 leftPair = rightPair = 0;
307                 continue;
308             }
309             uint32_t leftCase = leftPair & 0xffff;
310             uint32_t rightCase = rightPair & 0xffff;
311             if(leftCase != rightCase) {
312                 if((options & CollationSettings::UPPER_FIRST) == 0) {
313                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
314                 } else {
315                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
316                 }
317             }
318             if(leftPair == EOS) { break; }
319             leftPair >>= 16;
320             rightPair >>= 16;
321         }
322     }
323     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
324 
325     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
326     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
327 
328     leftIndex = rightIndex = 0;
329     leftPair = rightPair = 0;
330     for(;;) {
331         while(leftPair == 0) {
332             if(leftIndex == leftLength) {
333                 leftPair = EOS;
334                 break;
335             }
336             UChar32 c = left[leftIndex++];
337             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
338             if(leftPair < MIN_LONG) {
339                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
340             }
341             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
342         }
343 
344         while(rightPair == 0) {
345             if(rightIndex == rightLength) {
346                 rightPair = EOS;
347                 break;
348             }
349             UChar32 c = right[rightIndex++];
350             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
351             if(rightPair < MIN_LONG) {
352                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
353             }
354             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
355         }
356 
357         if(leftPair == rightPair) {
358             if(leftPair == EOS) { break; }
359             leftPair = rightPair = 0;
360             continue;
361         }
362         uint32_t leftTertiary = leftPair & 0xffff;
363         uint32_t rightTertiary = rightPair & 0xffff;
364         if(leftTertiary != rightTertiary) {
365             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
366                 // Pass through EOS and MERGE_WEIGHT
367                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
368                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
369                 if(leftTertiary > MERGE_WEIGHT) {
370                     leftTertiary ^= CASE_MASK;
371                 }
372                 if(rightTertiary > MERGE_WEIGHT) {
373                     rightTertiary ^= CASE_MASK;
374                 }
375             }
376             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
377         }
378         if(leftPair == EOS) { break; }
379         leftPair >>= 16;
380         rightPair >>= 16;
381     }
382     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
383 
384     leftIndex = rightIndex = 0;
385     leftPair = rightPair = 0;
386     for(;;) {
387         while(leftPair == 0) {
388             if(leftIndex == leftLength) {
389                 leftPair = EOS;
390                 break;
391             }
392             UChar32 c = left[leftIndex++];
393             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
394             if(leftPair < MIN_LONG) {
395                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
396             }
397             leftPair = getQuaternaries(variableTop, leftPair);
398         }
399 
400         while(rightPair == 0) {
401             if(rightIndex == rightLength) {
402                 rightPair = EOS;
403                 break;
404             }
405             UChar32 c = right[rightIndex++];
406             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
407             if(rightPair < MIN_LONG) {
408                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
409             }
410             rightPair = getQuaternaries(variableTop, rightPair);
411         }
412 
413         if(leftPair == rightPair) {
414             if(leftPair == EOS) { break; }
415             leftPair = rightPair = 0;
416             continue;
417         }
418         uint32_t leftQuaternary = leftPair & 0xffff;
419         uint32_t rightQuaternary = rightPair & 0xffff;
420         if(leftQuaternary != rightQuaternary) {
421             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
422         }
423         if(leftPair == EOS) { break; }
424         leftPair >>= 16;
425         rightPair >>= 16;
426     }
427     return UCOL_EQUAL;
428 }
429 
430 int32_t
compareUTF8(const uint16_t * table,const uint16_t * primaries,int32_t options,const uint8_t * left,int32_t leftLength,const uint8_t * right,int32_t rightLength)431 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
432                                  const uint8_t *left, int32_t leftLength,
433                                  const uint8_t *right, int32_t rightLength) {
434     // Keep compareUTF16() and compareUTF8() in sync very closely!
435 
436     U_ASSERT((table[0] >> 8) == VERSION);
437     table += (table[0] & 0xff);  // skip the header
438     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
439     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
440 
441     // Check for supported characters, fetch mini CEs, and compare primaries.
442     U_ALIGN_CODE(16);
443     int32_t leftIndex = 0, rightIndex = 0;
444     /**
445      * Single mini CE or a pair.
446      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
447      * If there is only one, then it is in the lower bits, and the upper bits are 0.
448      */
449     uint32_t leftPair = 0, rightPair = 0;
450     // Note: There is no need to assemble the code point.
451     // We only need to look up the table entry for the character,
452     // and nextPair() looks for whether c==0.
453     for(;;) {
454         // We fetch CEs until we get a non-ignorable primary or reach the end.
455         while(leftPair == 0) {
456             if(leftIndex == leftLength) {
457                 leftPair = EOS;
458                 break;
459             }
460             UChar32 c = left[leftIndex++];
461             uint8_t t;
462             if(c <= 0x7f) {
463                 leftPair = primaries[c];
464                 if(leftPair != 0) { break; }
465                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
466                     return BAIL_OUT_RESULT;
467                 }
468                 leftPair = table[c];
469             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
470                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
471                 ++leftIndex;
472                 c = ((c - 0xc2) << 6) + t;
473                 leftPair = primaries[c];
474                 if(leftPair != 0) { break; }
475                 leftPair = table[c];
476             } else {
477                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
478             }
479             if(leftPair >= MIN_SHORT) {
480                 leftPair &= SHORT_PRIMARY_MASK;
481                 break;
482             } else if(leftPair > variableTop) {
483                 leftPair &= LONG_PRIMARY_MASK;
484                 break;
485             } else {
486                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
487                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
488                 leftPair = getPrimaries(variableTop, leftPair);
489             }
490         }
491 
492         while(rightPair == 0) {
493             if(rightIndex == rightLength) {
494                 rightPair = EOS;
495                 break;
496             }
497             UChar32 c = right[rightIndex++];
498             uint8_t t;
499             if(c <= 0x7f) {
500                 rightPair = primaries[c];
501                 if(rightPair != 0) { break; }
502                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
503                     return BAIL_OUT_RESULT;
504                 }
505                 rightPair = table[c];
506             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
507                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
508                 ++rightIndex;
509                 c = ((c - 0xc2) << 6) + t;
510                 rightPair = primaries[c];
511                 if(rightPair != 0) { break; }
512                 rightPair = table[c];
513             } else {
514                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
515             }
516             if(rightPair >= MIN_SHORT) {
517                 rightPair &= SHORT_PRIMARY_MASK;
518                 break;
519             } else if(rightPair > variableTop) {
520                 rightPair &= LONG_PRIMARY_MASK;
521                 break;
522             } else {
523                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
524                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
525                 rightPair = getPrimaries(variableTop, rightPair);
526             }
527         }
528 
529         if(leftPair == rightPair) {
530             if(leftPair == EOS) { break; }
531             leftPair = rightPair = 0;
532             continue;
533         }
534         uint32_t leftPrimary = leftPair & 0xffff;
535         uint32_t rightPrimary = rightPair & 0xffff;
536         if(leftPrimary != rightPrimary) {
537             // Return the primary difference.
538             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
539         }
540         if(leftPair == EOS) { break; }
541         leftPair >>= 16;
542         rightPair >>= 16;
543     }
544     // In the following, we need to re-fetch each character because we did not buffer the CEs,
545     // but we know that the string is well-formed and
546     // only contains supported characters and mappings.
547 
548     // We might skip the secondary level but continue with the case level
549     // which is turned on separately.
550     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
551         leftIndex = rightIndex = 0;
552         leftPair = rightPair = 0;
553         for(;;) {
554             while(leftPair == 0) {
555                 if(leftIndex == leftLength) {
556                     leftPair = EOS;
557                     break;
558                 }
559                 UChar32 c = left[leftIndex++];
560                 if(c <= 0x7f) {
561                     leftPair = table[c];
562                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
563                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
564                 } else {
565                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
566                 }
567                 if(leftPair >= MIN_SHORT) {
568                     leftPair = getSecondariesFromOneShortCE(leftPair);
569                     break;
570                 } else if(leftPair > variableTop) {
571                     leftPair = COMMON_SEC_PLUS_OFFSET;
572                     break;
573                 } else {
574                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
575                     leftPair = getSecondaries(variableTop, leftPair);
576                 }
577             }
578 
579             while(rightPair == 0) {
580                 if(rightIndex == rightLength) {
581                     rightPair = EOS;
582                     break;
583                 }
584                 UChar32 c = right[rightIndex++];
585                 if(c <= 0x7f) {
586                     rightPair = table[c];
587                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
588                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
589                 } else {
590                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
591                 }
592                 if(rightPair >= MIN_SHORT) {
593                     rightPair = getSecondariesFromOneShortCE(rightPair);
594                     break;
595                 } else if(rightPair > variableTop) {
596                     rightPair = COMMON_SEC_PLUS_OFFSET;
597                     break;
598                 } else {
599                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
600                     rightPair = getSecondaries(variableTop, rightPair);
601                 }
602             }
603 
604             if(leftPair == rightPair) {
605                 if(leftPair == EOS) { break; }
606                 leftPair = rightPair = 0;
607                 continue;
608             }
609             uint32_t leftSecondary = leftPair & 0xffff;
610             uint32_t rightSecondary = rightPair & 0xffff;
611             if(leftSecondary != rightSecondary) {
612                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
613                     // Full support for backwards secondary requires backwards contraction matching
614                     // and moving backwards between merge separators.
615                     return BAIL_OUT_RESULT;
616                 }
617                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
618             }
619             if(leftPair == EOS) { break; }
620             leftPair >>= 16;
621             rightPair >>= 16;
622         }
623     }
624 
625     if((options & CollationSettings::CASE_LEVEL) != 0) {
626         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
627         leftIndex = rightIndex = 0;
628         leftPair = rightPair = 0;
629         for(;;) {
630             while(leftPair == 0) {
631                 if(leftIndex == leftLength) {
632                     leftPair = EOS;
633                     break;
634                 }
635                 UChar32 c = left[leftIndex++];
636                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
637                 if(leftPair < MIN_LONG) {
638                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
639                 }
640                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
641             }
642 
643             while(rightPair == 0) {
644                 if(rightIndex == rightLength) {
645                     rightPair = EOS;
646                     break;
647                 }
648                 UChar32 c = right[rightIndex++];
649                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
650                 if(rightPair < MIN_LONG) {
651                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
652                 }
653                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
654             }
655 
656             if(leftPair == rightPair) {
657                 if(leftPair == EOS) { break; }
658                 leftPair = rightPair = 0;
659                 continue;
660             }
661             uint32_t leftCase = leftPair & 0xffff;
662             uint32_t rightCase = rightPair & 0xffff;
663             if(leftCase != rightCase) {
664                 if((options & CollationSettings::UPPER_FIRST) == 0) {
665                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
666                 } else {
667                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
668                 }
669             }
670             if(leftPair == EOS) { break; }
671             leftPair >>= 16;
672             rightPair >>= 16;
673         }
674     }
675     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
676 
677     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
678     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
679 
680     leftIndex = rightIndex = 0;
681     leftPair = rightPair = 0;
682     for(;;) {
683         while(leftPair == 0) {
684             if(leftIndex == leftLength) {
685                 leftPair = EOS;
686                 break;
687             }
688             UChar32 c = left[leftIndex++];
689             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
690             if(leftPair < MIN_LONG) {
691                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
692             }
693             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
694         }
695 
696         while(rightPair == 0) {
697             if(rightIndex == rightLength) {
698                 rightPair = EOS;
699                 break;
700             }
701             UChar32 c = right[rightIndex++];
702             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
703             if(rightPair < MIN_LONG) {
704                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
705             }
706             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
707         }
708 
709         if(leftPair == rightPair) {
710             if(leftPair == EOS) { break; }
711             leftPair = rightPair = 0;
712             continue;
713         }
714         uint32_t leftTertiary = leftPair & 0xffff;
715         uint32_t rightTertiary = rightPair & 0xffff;
716         if(leftTertiary != rightTertiary) {
717             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
718                 // Pass through EOS and MERGE_WEIGHT
719                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
720                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
721                 if(leftTertiary > MERGE_WEIGHT) {
722                     leftTertiary ^= CASE_MASK;
723                 }
724                 if(rightTertiary > MERGE_WEIGHT) {
725                     rightTertiary ^= CASE_MASK;
726                 }
727             }
728             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
729         }
730         if(leftPair == EOS) { break; }
731         leftPair >>= 16;
732         rightPair >>= 16;
733     }
734     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
735 
736     leftIndex = rightIndex = 0;
737     leftPair = rightPair = 0;
738     for(;;) {
739         while(leftPair == 0) {
740             if(leftIndex == leftLength) {
741                 leftPair = EOS;
742                 break;
743             }
744             UChar32 c = left[leftIndex++];
745             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
746             if(leftPair < MIN_LONG) {
747                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
748             }
749             leftPair = getQuaternaries(variableTop, leftPair);
750         }
751 
752         while(rightPair == 0) {
753             if(rightIndex == rightLength) {
754                 rightPair = EOS;
755                 break;
756             }
757             UChar32 c = right[rightIndex++];
758             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
759             if(rightPair < MIN_LONG) {
760                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
761             }
762             rightPair = getQuaternaries(variableTop, rightPair);
763         }
764 
765         if(leftPair == rightPair) {
766             if(leftPair == EOS) { break; }
767             leftPair = rightPair = 0;
768             continue;
769         }
770         uint32_t leftQuaternary = leftPair & 0xffff;
771         uint32_t rightQuaternary = rightPair & 0xffff;
772         if(leftQuaternary != rightQuaternary) {
773             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
774         }
775         if(leftPair == EOS) { break; }
776         leftPair >>= 16;
777         rightPair >>= 16;
778     }
779     return UCOL_EQUAL;
780 }
781 
782 uint32_t
lookup(const uint16_t * table,UChar32 c)783 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
784     U_ASSERT(c > LATIN_MAX);
785     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
786         return table[c - PUNCT_START + LATIN_LIMIT];
787     } else if(c == 0xfffe) {
788         return MERGE_WEIGHT;
789     } else if(c == 0xffff) {
790         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
791     } else {
792         return BAIL_OUT;
793     }
794 }
795 
796 uint32_t
lookupUTF8(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex,int32_t sLength)797 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
798                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
799     // The caller handled ASCII and valid/supported Latin.
800     U_ASSERT(c > 0x7f);
801     int32_t i2 = sIndex + 1;
802     if(i2 < sLength || sLength < 0) {
803         uint8_t t1 = s8[sIndex];
804         uint8_t t2 = s8[i2];
805         sIndex += 2;
806         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
807             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
808         } else if(c == 0xef && t1 == 0xbf) {
809             if(t2 == 0xbe) {
810                 return MERGE_WEIGHT;  // U+FFFE
811             } else if(t2 == 0xbf) {
812                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
813             }
814         }
815     }
816     return BAIL_OUT;
817 }
818 
819 uint32_t
lookupUTF8Unsafe(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex)820 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
821                                      const uint8_t *s8, int32_t &sIndex) {
822     // The caller handled ASCII.
823     // The string is well-formed and contains only supported characters.
824     U_ASSERT(c > 0x7f);
825     if(c <= LATIN_MAX_UTF8_LEAD) {
826         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
827     }
828     uint8_t t2 = s8[sIndex + 1];
829     sIndex += 2;
830     if(c == 0xe2) {
831         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
832     } else if(t2 == 0xbe) {
833         return MERGE_WEIGHT;  // U+FFFE
834     } else {
835         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
836     }
837 }
838 
839 uint32_t
nextPair(const uint16_t * table,UChar32 c,uint32_t ce,const UChar * s16,const uint8_t * s8,int32_t & sIndex,int32_t & sLength)840 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
841                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
842     if(ce >= MIN_LONG || ce < CONTRACTION) {
843         return ce;  // simple or special mini CE
844     } else if(ce >= EXPANSION) {
845         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
846         return ((uint32_t)table[index + 1] << 16) | table[index];
847     } else /* ce >= CONTRACTION */ {
848         if(c == 0 && sLength < 0) {
849             sLength = sIndex - 1;
850             return EOS;
851         }
852         // Contraction list: Default mapping followed by
853         // 0 or more single-character contraction suffix mappings.
854         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
855         if(sIndex != sLength) {
856             // Read the next character.
857             int32_t c2;
858             int32_t nextIndex = sIndex;
859             if(s16 != NULL) {
860                 c2 = s16[nextIndex++];
861                 if(c2 > LATIN_MAX) {
862                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
863                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
864                     } else if(c2 == 0xfffe || c2 == 0xffff) {
865                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
866                     } else {
867                         return BAIL_OUT;
868                     }
869                 }
870             } else {
871                 c2 = s8[nextIndex++];
872                 if(c2 > 0x7f) {
873                     uint8_t t;
874                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
875                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
876                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
877                         ++nextIndex;
878                     } else {
879                         int32_t i2 = nextIndex + 1;
880                         if(i2 < sLength || sLength < 0) {
881                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
882                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
883                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
884                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
885                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
886                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
887                             } else {
888                                 return BAIL_OUT;
889                             }
890                         } else {
891                             return BAIL_OUT;
892                         }
893                         nextIndex += 2;
894                     }
895                 }
896             }
897             if(c2 == 0 && sLength < 0) {
898                 sLength = sIndex;
899                 c2 = -1;
900             }
901             // Look for the next character in the contraction suffix list,
902             // which is in ascending order of single suffix characters.
903             int32_t i = index;
904             int32_t head = table[i];  // first skip the default mapping
905             int32_t x;
906             do {
907                 i += head >> CONTR_LENGTH_SHIFT;
908                 head = table[i];
909                 x = head & CONTR_CHAR_MASK;
910             } while(x < c2);
911             if(x == c2) {
912                 index = i;
913                 sIndex = nextIndex;
914             }
915         }
916         // Return the CE or CEs for the default or contraction mapping.
917         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
918         if(length == 1) {
919             return BAIL_OUT;
920         }
921         ce = table[index + 1];
922         if(length == 2) {
923             return ce;
924         } else {
925             return ((uint32_t)table[index + 2] << 16) | ce;
926         }
927     }
928 }
929 
930 uint32_t
getSecondaries(uint32_t variableTop,uint32_t pair)931 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
932     if(pair <= 0xffff) {
933         // one mini CE
934         if(pair >= MIN_SHORT) {
935             pair = getSecondariesFromOneShortCE(pair);
936         } else if(pair > variableTop) {
937             pair = COMMON_SEC_PLUS_OFFSET;
938         } else if(pair >= MIN_LONG) {
939             pair = 0;  // variable
940         }
941         // else special mini CE
942     } else {
943         uint32_t ce = pair & 0xffff;
944         if(ce >= MIN_SHORT) {
945             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
946         } else if(ce > variableTop) {
947             pair = TWO_COMMON_SEC_PLUS_OFFSET;
948         } else {
949             U_ASSERT(ce >= MIN_LONG);
950             pair = 0;  // variable
951         }
952     }
953     return pair;
954 }
955 
956 uint32_t
getCases(uint32_t variableTop,UBool strengthIsPrimary,uint32_t pair)957 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
958     // Primary+caseLevel: Ignore case level weights of primary ignorables.
959     // Otherwise: Ignore case level weights of secondary ignorables.
960     // For details see the comments in the CollationCompare class.
961     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
962     if(pair <= 0xffff) {
963         // one mini CE
964         if(pair >= MIN_SHORT) {
965             // A high secondary weight means we really have two CEs,
966             // a primary CE and a secondary CE.
967             uint32_t ce = pair;
968             pair &= CASE_MASK;  // explicit weight of primary CE
969             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
970                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
971             }
972         } else if(pair > variableTop) {
973             pair = LOWER_CASE;
974         } else if(pair >= MIN_LONG) {
975             pair = 0;  // variable
976         }
977         // else special mini CE
978     } else {
979         // two mini CEs, same primary groups, neither expands like above
980         uint32_t ce = pair & 0xffff;
981         if(ce >= MIN_SHORT) {
982             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
983                 pair &= CASE_MASK;
984             } else {
985                 pair &= TWO_CASES_MASK;
986             }
987         } else if(ce > variableTop) {
988             pair = TWO_LOWER_CASES;
989         } else {
990             U_ASSERT(ce >= MIN_LONG);
991             pair = 0;  // variable
992         }
993     }
994     return pair;
995 }
996 
997 uint32_t
getTertiaries(uint32_t variableTop,UBool withCaseBits,uint32_t pair)998 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
999     if(pair <= 0xffff) {
1000         // one mini CE
1001         if(pair >= MIN_SHORT) {
1002             // A high secondary weight means we really have two CEs,
1003             // a primary CE and a secondary CE.
1004             uint32_t ce = pair;
1005             if(withCaseBits) {
1006                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1007                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1008                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1009                 }
1010             } else {
1011                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1012                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1013                     pair |= COMMON_TER_PLUS_OFFSET << 16;
1014                 }
1015             }
1016         } else if(pair > variableTop) {
1017             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1018             if(withCaseBits) {
1019                 pair |= LOWER_CASE;
1020             }
1021         } else if(pair >= MIN_LONG) {
1022             pair = 0;  // variable
1023         }
1024         // else special mini CE
1025     } else {
1026         // two mini CEs, same primary groups, neither expands like above
1027         uint32_t ce = pair & 0xffff;
1028         if(ce >= MIN_SHORT) {
1029             if(withCaseBits) {
1030                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1031             } else {
1032                 pair &= TWO_TERTIARIES_MASK;
1033             }
1034             pair += TWO_TER_OFFSETS;
1035         } else if(ce > variableTop) {
1036             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1037             if(withCaseBits) {
1038                 pair |= TWO_LOWER_CASES;
1039             }
1040         } else {
1041             U_ASSERT(ce >= MIN_LONG);
1042             pair = 0;  // variable
1043         }
1044     }
1045     return pair;
1046 }
1047 
1048 uint32_t
getQuaternaries(uint32_t variableTop,uint32_t pair)1049 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1050     // Return the primary weight of a variable CE,
1051     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1052     if(pair <= 0xffff) {
1053         // one mini CE
1054         if(pair >= MIN_SHORT) {
1055             // A high secondary weight means we really have two CEs,
1056             // a primary CE and a secondary CE.
1057             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1058                 pair = TWO_SHORT_PRIMARIES_MASK;
1059             } else {
1060                 pair = SHORT_PRIMARY_MASK;
1061             }
1062         } else if(pair > variableTop) {
1063             pair = SHORT_PRIMARY_MASK;
1064         } else if(pair >= MIN_LONG) {
1065             pair &= LONG_PRIMARY_MASK;  // variable
1066         }
1067         // else special mini CE
1068     } else {
1069         // two mini CEs, same primary groups, neither expands like above
1070         uint32_t ce = pair & 0xffff;
1071         if(ce > variableTop) {
1072             pair = TWO_SHORT_PRIMARIES_MASK;
1073         } else {
1074             U_ASSERT(ce >= MIN_LONG);
1075             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
1076         }
1077     }
1078     return pair;
1079 }
1080 
1081 U_NAMESPACE_END
1082 
1083 #endif  // !UCONFIG_NO_COLLATION
1084