1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationFastLatin.java, ported from collationfastlatin.h/.cpp 9 * 10 * C++ version created on: 2013aug09 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.lang.UScript; 17 import com.ibm.icu.text.Collator; 18 19 public final class CollationFastLatin /* all static */ { 20 /** 21 * Fast Latin format version (one byte 1..FF). 22 * Must be incremented for any runtime-incompatible changes, 23 * in particular, for changes to any of the following constants. 24 * 25 * When the major version number of the main data format changes, 26 * we can reset this fast Latin version to 1. 27 */ 28 public static final int VERSION = 2; 29 30 public static final int LATIN_MAX = 0x17f; 31 public static final int LATIN_LIMIT = LATIN_MAX + 1; 32 33 static final int LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX 34 35 static final int PUNCT_START = 0x2000; 36 static final int PUNCT_LIMIT = 0x2040; 37 38 // excludes U+FFFE & U+FFFF 39 static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START); 40 41 // Note on the supported weight ranges: 42 // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that 43 // the CEs for characters in the above ranges, excluding expansions with length >2, 44 // excluding contractions of >2 characters, and other restrictions 45 // (see the builder's getCEsFromCE32()), 46 // use at most about 150 primary weights, 47 // where about 94 primary weights are possibly-variable (space/punct/symbol/currency), 48 // at most 4 secondary before-common weights, 49 // at most 4 secondary after-common weights, 50 // at most 16 secondary high weights (in secondary CEs), and 51 // at most 4 tertiary after-common weights. 52 // The following ranges are designed to support slightly more weights than that. 53 // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.) 54 55 // Digits may use long primaries (preserving more short ones) 56 // or short primaries (faster) without changing this data structure. 57 // (If we supported numeric collation, then digits would have to have long primaries 58 // so that special handling does not affect the fast path.) 59 60 static final int SHORT_PRIMARY_MASK = 0xfc00; // bits 15..10 61 static final int INDEX_MASK = 0x3ff; // bits 9..0 for expansions & contractions 62 static final int SECONDARY_MASK = 0x3e0; // bits 9..5 63 static final int CASE_MASK = 0x18; // bits 4..3 64 static final int LONG_PRIMARY_MASK = 0xfff8; // bits 15..3 65 static final int TERTIARY_MASK = 7; // bits 2..0 66 static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK; 67 68 static final int TWO_SHORT_PRIMARIES_MASK = 69 (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK; // 0xfc00fc00 70 static final int TWO_LONG_PRIMARIES_MASK = 71 (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK; // 0xfff8fff8 72 static final int TWO_SECONDARIES_MASK = 73 (SECONDARY_MASK << 16) | SECONDARY_MASK; // 0x3e003e0 74 static final int TWO_CASES_MASK = 75 (CASE_MASK << 16) | CASE_MASK; // 0x180018 76 static final int TWO_TERTIARIES_MASK = 77 (TERTIARY_MASK << 16) | TERTIARY_MASK; // 0x70007 78 79 /** 80 * Contraction with one fast Latin character. 81 * Use INDEX_MASK to find the start of the contraction list after the fixed table. 82 * The first entry contains the default mapping. 83 * Otherwise use CONTR_CHAR_MASK for the contraction character index 84 * (in ascending order). 85 * Use CONTR_LENGTH_SHIFT for the length of the entry 86 * (1=BAIL_OUT, 2=one CE, 3=two CEs). 87 * 88 * Also, U+0000 maps to a contraction entry, so that the fast path need not 89 * check for NUL termination. 90 * It usually maps to a contraction list with only the completely ignorable default value. 91 */ 92 static final int CONTRACTION = 0x400; 93 /** 94 * An expansion encodes two CEs. 95 * Use INDEX_MASK to find the pair of CEs after the fixed table. 96 * 97 * The higher a mini CE value, the easier it is to process. 98 * For expansions and higher, no context needs to be considered. 99 */ 100 static final int EXPANSION = 0x800; 101 /** 102 * Encodes one CE with a long/low mini primary (there are 128). 103 * All potentially-variable primaries must be in this range, 104 * to make the short-primary path as fast as possible. 105 */ 106 static final int MIN_LONG = 0xc00; 107 static final int LONG_INC = 8; 108 static final int MAX_LONG = 0xff8; 109 /** 110 * Encodes one CE with a short/high primary (there are 60), 111 * plus a secondary CE if the secondary weight is high. 112 * Fast handling: At least all letter primaries should be in this range. 113 */ 114 static final int MIN_SHORT = 0x1000; 115 static final int SHORT_INC = 0x400; 116 /** The highest primary weight is reserved for U+FFFF. */ 117 static final int MAX_SHORT = SHORT_PRIMARY_MASK; 118 119 static final int MIN_SEC_BEFORE = 0; // must add SEC_OFFSET 120 static final int SEC_INC = 0x20; 121 static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC; // 5 before common 122 static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC; 123 static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC; 124 static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC; // 6 after common 125 static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC; // 20 high secondaries 126 static final int MAX_SEC_HIGH = SECONDARY_MASK; 127 128 /** 129 * Lookup: Add this offset to secondary weights, except for completely ignorable CEs. 130 * Must be greater than any special value, e.g., MERGE_WEIGHT. 131 * The exact value is not relevant for the format version. 132 */ 133 static final int SEC_OFFSET = SEC_INC; 134 static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET; 135 136 static final int TWO_SEC_OFFSETS = 137 (SEC_OFFSET << 16) | SEC_OFFSET; // 0x200020 138 static final int TWO_COMMON_SEC_PLUS_OFFSET = 139 (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET; 140 141 static final int LOWER_CASE = 8; // case bits include this offset 142 static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE; // 0x80008 143 144 static final int COMMON_TER = 0; // must add TER_OFFSET 145 static final int MAX_TER_AFTER = 7; // 7 after common 146 147 /** 148 * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs. 149 * Must be greater than any special value, e.g., MERGE_WEIGHT. 150 * Must be greater than case bits as well, so that with combined case+tertiary weights 151 * plus the offset the tertiary bits does not spill over into the case bits. 152 * The exact value is not relevant for the format version. 153 */ 154 static final int TER_OFFSET = SEC_OFFSET; 155 static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET; 156 157 static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET; 158 static final int TWO_COMMON_TER_PLUS_OFFSET = 159 (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET; 160 161 static final int MERGE_WEIGHT = 3; 162 static final int EOS = 2; // end of string 163 static final int BAIL_OUT = 1; 164 165 /** 166 * Contraction result first word bits 8..0 contain the 167 * second contraction character, as a char index 0..NUM_FAST_CHARS-1. 168 * Each contraction list is terminated with a word containing CONTR_CHAR_MASK. 169 */ 170 static final int CONTR_CHAR_MASK = 0x1ff; 171 /** 172 * Contraction result first word bits 10..9 contain the result length: 173 * 1=bail out, 2=one mini CE, 3=two mini CEs 174 */ 175 static final int CONTR_LENGTH_SHIFT = 9; 176 177 /** 178 * Comparison return value when the regular comparison must be used. 179 * The exact value is not relevant for the format version. 180 */ 181 public static final int BAIL_OUT_RESULT = -2; 182 getCharIndex(char c)183 static int getCharIndex(char c) { 184 if(c <= LATIN_MAX) { 185 return c; 186 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 187 return c - (PUNCT_START - LATIN_LIMIT); 188 } else { 189 // Not a fast Latin character. 190 // Note: U+FFFE & U+FFFF are forbidden in tailorings 191 // and thus do not occur in any contractions. 192 return -1; 193 } 194 } 195 196 /** 197 * Computes the options value for the compare functions 198 * and writes the precomputed primary weights. 199 * Returns -1 if the Latin fastpath is not supported for the data and settings. 200 * The capacity must be LATIN_LIMIT. 201 */ getOptions(CollationData data, CollationSettings settings, char[] primaries)202 public static int getOptions(CollationData data, CollationSettings settings, 203 char[] primaries) { 204 char[] header = data.fastLatinTableHeader; 205 if(header == null) { return -1; } 206 assert((header[0] >> 8) == VERSION); 207 if(primaries.length != LATIN_LIMIT) { 208 assert false; 209 return -1; 210 } 211 212 int miniVarTop; 213 if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) { 214 // No mini primaries are variable, set a variableTop just below the 215 // lowest long mini primary. 216 miniVarTop = MIN_LONG - 1; 217 } else { 218 int headerLength = header[0] & 0xff; 219 int i = 1 + settings.getMaxVariable(); 220 if(i >= headerLength) { 221 return -1; // variableTop >= digits, should not occur 222 } 223 miniVarTop = header[i]; 224 } 225 226 boolean digitsAreReordered = false; 227 if(settings.hasReordering()) { 228 long prevStart = 0; 229 long beforeDigitStart = 0; 230 long digitStart = 0; 231 long afterDigitStart = 0; 232 for(int group = Collator.ReorderCodes.FIRST; 233 group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES; 234 ++group) { 235 long start = data.getFirstPrimaryForGroup(group); 236 start = settings.reorder(start); 237 if(group == Collator.ReorderCodes.DIGIT) { 238 beforeDigitStart = prevStart; 239 digitStart = start; 240 } else if(start != 0) { 241 if(start < prevStart) { 242 // The permutation affects the groups up to Latin. 243 return -1; 244 } 245 // In the future, there might be a special group between digits & Latin. 246 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 247 afterDigitStart = start; 248 } 249 prevStart = start; 250 } 251 } 252 long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN); 253 latinStart = settings.reorder(latinStart); 254 if(latinStart < prevStart) { 255 return -1; 256 } 257 if(afterDigitStart == 0) { 258 afterDigitStart = latinStart; 259 } 260 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 261 digitsAreReordered = true; 262 } 263 } 264 265 char[] table = data.fastLatinTable; // skip the header 266 for(int c = 0; c < LATIN_LIMIT; ++c) { 267 int p = table[c]; 268 if(p >= MIN_SHORT) { 269 p &= SHORT_PRIMARY_MASK; 270 } else if(p > miniVarTop) { 271 p &= LONG_PRIMARY_MASK; 272 } else { 273 p = 0; 274 } 275 primaries[c] = (char)p; 276 } 277 if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) { 278 // Bail out for digits. 279 for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 280 } 281 282 // Shift the miniVarTop above other options. 283 return (miniVarTop << 16) | settings.options; 284 } 285 compareUTF16(char[] table, char[] primaries, int options, CharSequence left, CharSequence right, int startIndex)286 public static int compareUTF16(char[] table, char[] primaries, int options, 287 CharSequence left, CharSequence right, int startIndex) { 288 // This is a modified copy of CollationCompare.compareUpToQuaternary(), 289 // optimized for common Latin text. 290 // Keep them in sync! 291 292 int variableTop = options >> 16; // see getOptions() 293 options &= 0xffff; // needed for CollationSettings.getStrength() to work 294 295 // Check for supported characters, fetch mini CEs, and compare primaries. 296 int leftIndex = startIndex, rightIndex = startIndex; 297 /** 298 * Single mini CE or a pair. 299 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 300 * If there is only one, then it is in the lower bits, and the upper bits are 0. 301 */ 302 int leftPair = 0, rightPair = 0; 303 for(;;) { 304 // We fetch CEs until we get a non-ignorable primary or reach the end. 305 while(leftPair == 0) { 306 if(leftIndex == left.length()) { 307 leftPair = EOS; 308 break; 309 } 310 int c = left.charAt(leftIndex++); 311 if(c <= LATIN_MAX) { 312 leftPair = primaries[c]; 313 if(leftPair != 0) { break; } 314 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 315 return BAIL_OUT_RESULT; 316 } 317 leftPair = table[c]; 318 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 319 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 320 } else { 321 leftPair = lookup(table, c); 322 } 323 if(leftPair >= MIN_SHORT) { 324 leftPair &= SHORT_PRIMARY_MASK; 325 break; 326 } else if(leftPair > variableTop) { 327 leftPair &= LONG_PRIMARY_MASK; 328 break; 329 } else { 330 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 331 if(pairAndInc < 0) { 332 ++leftIndex; 333 pairAndInc = ~pairAndInc; 334 } 335 leftPair = (int)pairAndInc; 336 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 337 leftPair = getPrimaries(variableTop, leftPair); 338 } 339 } 340 341 while(rightPair == 0) { 342 if(rightIndex == right.length()) { 343 rightPair = EOS; 344 break; 345 } 346 int c = right.charAt(rightIndex++); 347 if(c <= LATIN_MAX) { 348 rightPair = primaries[c]; 349 if(rightPair != 0) { break; } 350 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 351 return BAIL_OUT_RESULT; 352 } 353 rightPair = table[c]; 354 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 355 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 356 } else { 357 rightPair = lookup(table, c); 358 } 359 if(rightPair >= MIN_SHORT) { 360 rightPair &= SHORT_PRIMARY_MASK; 361 break; 362 } else if(rightPair > variableTop) { 363 rightPair &= LONG_PRIMARY_MASK; 364 break; 365 } else { 366 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 367 if(pairAndInc < 0) { 368 ++rightIndex; 369 pairAndInc = ~pairAndInc; 370 } 371 rightPair = (int)pairAndInc; 372 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 373 rightPair = getPrimaries(variableTop, rightPair); 374 } 375 } 376 377 if(leftPair == rightPair) { 378 if(leftPair == EOS) { break; } 379 leftPair = rightPair = 0; 380 continue; 381 } 382 int leftPrimary = leftPair & 0xffff; 383 int rightPrimary = rightPair & 0xffff; 384 if(leftPrimary != rightPrimary) { 385 // Return the primary difference. 386 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER; 387 } 388 if(leftPair == EOS) { break; } 389 leftPair >>>= 16; 390 rightPair >>>= 16; 391 } 392 // In the following, we need to re-fetch each character because we did not buffer the CEs, 393 // but we know that the string is well-formed and 394 // only contains supported characters and mappings. 395 396 // We might skip the secondary level but continue with the case level 397 // which is turned on separately. 398 if(CollationSettings.getStrength(options) >= Collator.SECONDARY) { 399 leftIndex = rightIndex = startIndex; 400 leftPair = rightPair = 0; 401 for(;;) { 402 while(leftPair == 0) { 403 if(leftIndex == left.length()) { 404 leftPair = EOS; 405 break; 406 } 407 int c = left.charAt(leftIndex++); 408 if(c <= LATIN_MAX) { 409 leftPair = table[c]; 410 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 411 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 412 } else { 413 leftPair = lookup(table, c); 414 } 415 if(leftPair >= MIN_SHORT) { 416 leftPair = getSecondariesFromOneShortCE(leftPair); 417 break; 418 } else if(leftPair > variableTop) { 419 leftPair = COMMON_SEC_PLUS_OFFSET; 420 break; 421 } else { 422 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 423 if(pairAndInc < 0) { 424 ++leftIndex; 425 pairAndInc = ~pairAndInc; 426 } 427 leftPair = getSecondaries(variableTop, (int)pairAndInc); 428 } 429 } 430 431 while(rightPair == 0) { 432 if(rightIndex == right.length()) { 433 rightPair = EOS; 434 break; 435 } 436 int c = right.charAt(rightIndex++); 437 if(c <= LATIN_MAX) { 438 rightPair = table[c]; 439 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 440 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 441 } else { 442 rightPair = lookup(table, c); 443 } 444 if(rightPair >= MIN_SHORT) { 445 rightPair = getSecondariesFromOneShortCE(rightPair); 446 break; 447 } else if(rightPair > variableTop) { 448 rightPair = COMMON_SEC_PLUS_OFFSET; 449 break; 450 } else { 451 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 452 if(pairAndInc < 0) { 453 ++rightIndex; 454 pairAndInc = ~pairAndInc; 455 } 456 rightPair = getSecondaries(variableTop, (int)pairAndInc); 457 } 458 } 459 460 if(leftPair == rightPair) { 461 if(leftPair == EOS) { break; } 462 leftPair = rightPair = 0; 463 continue; 464 } 465 int leftSecondary = leftPair & 0xffff; 466 int rightSecondary = rightPair & 0xffff; 467 if(leftSecondary != rightSecondary) { 468 if((options & CollationSettings.BACKWARD_SECONDARY) != 0) { 469 // Full support for backwards secondary requires backwards contraction matching 470 // and moving backwards between merge separators. 471 return BAIL_OUT_RESULT; 472 } 473 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 474 } 475 if(leftPair == EOS) { break; } 476 leftPair >>>= 16; 477 rightPair >>>= 16; 478 } 479 } 480 481 if((options & CollationSettings.CASE_LEVEL) != 0) { 482 boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY; 483 leftIndex = rightIndex = startIndex; 484 leftPair = rightPair = 0; 485 for(;;) { 486 while(leftPair == 0) { 487 if(leftIndex == left.length()) { 488 leftPair = EOS; 489 break; 490 } 491 int c = left.charAt(leftIndex++); 492 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 493 if(leftPair < MIN_LONG) { 494 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 495 if(pairAndInc < 0) { 496 ++leftIndex; 497 pairAndInc = ~pairAndInc; 498 } 499 leftPair = (int)pairAndInc; 500 } 501 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 502 } 503 504 while(rightPair == 0) { 505 if(rightIndex == right.length()) { 506 rightPair = EOS; 507 break; 508 } 509 int c = right.charAt(rightIndex++); 510 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 511 if(rightPair < MIN_LONG) { 512 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 513 if(pairAndInc < 0) { 514 ++rightIndex; 515 pairAndInc = ~pairAndInc; 516 } 517 rightPair = (int)pairAndInc; 518 } 519 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 520 } 521 522 if(leftPair == rightPair) { 523 if(leftPair == EOS) { break; } 524 leftPair = rightPair = 0; 525 continue; 526 } 527 int leftCase = leftPair & 0xffff; 528 int rightCase = rightPair & 0xffff; 529 if(leftCase != rightCase) { 530 if((options & CollationSettings.UPPER_FIRST) == 0) { 531 return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER; 532 } else { 533 return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS; 534 } 535 } 536 if(leftPair == EOS) { break; } 537 leftPair >>>= 16; 538 rightPair >>>= 16; 539 } 540 } 541 if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; } 542 543 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 544 boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options); 545 546 leftIndex = rightIndex = startIndex; 547 leftPair = rightPair = 0; 548 for(;;) { 549 while(leftPair == 0) { 550 if(leftIndex == left.length()) { 551 leftPair = EOS; 552 break; 553 } 554 int c = left.charAt(leftIndex++); 555 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 556 if(leftPair < MIN_LONG) { 557 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 558 if(pairAndInc < 0) { 559 ++leftIndex; 560 pairAndInc = ~pairAndInc; 561 } 562 leftPair = (int)pairAndInc; 563 } 564 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 565 } 566 567 while(rightPair == 0) { 568 if(rightIndex == right.length()) { 569 rightPair = EOS; 570 break; 571 } 572 int c = right.charAt(rightIndex++); 573 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 574 if(rightPair < MIN_LONG) { 575 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 576 if(pairAndInc < 0) { 577 ++rightIndex; 578 pairAndInc = ~pairAndInc; 579 } 580 rightPair = (int)pairAndInc; 581 } 582 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 583 } 584 585 if(leftPair == rightPair) { 586 if(leftPair == EOS) { break; } 587 leftPair = rightPair = 0; 588 continue; 589 } 590 int leftTertiary = leftPair & 0xffff; 591 int rightTertiary = rightPair & 0xffff; 592 if(leftTertiary != rightTertiary) { 593 if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) { 594 // Pass through EOS and MERGE_WEIGHT 595 // and keep real tertiary weights larger than the MERGE_WEIGHT. 596 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 597 if(leftTertiary > MERGE_WEIGHT) { 598 leftTertiary ^= CASE_MASK; 599 } 600 if(rightTertiary > MERGE_WEIGHT) { 601 rightTertiary ^= CASE_MASK; 602 } 603 } 604 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER; 605 } 606 if(leftPair == EOS) { break; } 607 leftPair >>>= 16; 608 rightPair >>>= 16; 609 } 610 if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; } 611 612 leftIndex = rightIndex = startIndex; 613 leftPair = rightPair = 0; 614 for(;;) { 615 while(leftPair == 0) { 616 if(leftIndex == left.length()) { 617 leftPair = EOS; 618 break; 619 } 620 int c = left.charAt(leftIndex++); 621 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 622 if(leftPair < MIN_LONG) { 623 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 624 if(pairAndInc < 0) { 625 ++leftIndex; 626 pairAndInc = ~pairAndInc; 627 } 628 leftPair = (int)pairAndInc; 629 } 630 leftPair = getQuaternaries(variableTop, leftPair); 631 } 632 633 while(rightPair == 0) { 634 if(rightIndex == right.length()) { 635 rightPair = EOS; 636 break; 637 } 638 int c = right.charAt(rightIndex++); 639 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 640 if(rightPair < MIN_LONG) { 641 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 642 if(pairAndInc < 0) { 643 ++rightIndex; 644 pairAndInc = ~pairAndInc; 645 } 646 rightPair = (int)pairAndInc; 647 } 648 rightPair = getQuaternaries(variableTop, rightPair); 649 } 650 651 if(leftPair == rightPair) { 652 if(leftPair == EOS) { break; } 653 leftPair = rightPair = 0; 654 continue; 655 } 656 int leftQuaternary = leftPair & 0xffff; 657 int rightQuaternary = rightPair & 0xffff; 658 if(leftQuaternary != rightQuaternary) { 659 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER; 660 } 661 if(leftPair == EOS) { break; } 662 leftPair >>>= 16; 663 rightPair >>>= 16; 664 } 665 return Collation.EQUAL; 666 } 667 lookup(char[] table, int c)668 private static int lookup(char[] table, int c) { 669 assert(c > LATIN_MAX); 670 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 671 return table[c - PUNCT_START + LATIN_LIMIT]; 672 } else if(c == 0xfffe) { 673 return MERGE_WEIGHT; 674 } else if(c == 0xffff) { 675 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 676 } else { 677 return BAIL_OUT; 678 } 679 } 680 681 /** 682 * Java returns a negative result (use the '~' operator) if sIndex is to be incremented. 683 * C++ modifies sIndex. 684 */ nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex)685 private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) { 686 if(ce >= MIN_LONG || ce < CONTRACTION) { 687 return ce; // simple or special mini CE 688 } else if(ce >= EXPANSION) { 689 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 690 return ((long)table[index + 1] << 16) | table[index]; 691 } else /* ce >= CONTRACTION */ { 692 // Contraction list: Default mapping followed by 693 // 0 or more single-character contraction suffix mappings. 694 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 695 boolean inc = false; // true if the next char is consumed. 696 if(sIndex != s16.length()) { 697 // Read the next character. 698 int c2; 699 int nextIndex = sIndex; 700 c2 = s16.charAt(nextIndex++); 701 if(c2 > LATIN_MAX) { 702 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 703 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 704 } else if(c2 == 0xfffe || c2 == 0xffff) { 705 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 706 } else { 707 return BAIL_OUT; 708 } 709 } 710 // Look for the next character in the contraction suffix list, 711 // which is in ascending order of single suffix characters. 712 int i = index; 713 int head = table[i]; // first skip the default mapping 714 int x; 715 do { 716 i += head >> CONTR_LENGTH_SHIFT; 717 head = table[i]; 718 x = head & CONTR_CHAR_MASK; 719 } while(x < c2); 720 if(x == c2) { 721 index = i; 722 inc = true; 723 } 724 } 725 // Return the CE or CEs for the default or contraction mapping. 726 int length = table[index] >> CONTR_LENGTH_SHIFT; 727 if(length == 1) { 728 return BAIL_OUT; 729 } 730 ce = table[index + 1]; 731 long result; 732 if(length == 2) { 733 result = ce; 734 } else { 735 result = ((long)table[index + 2] << 16) | ce; 736 } 737 return inc ? ~result : result; 738 } 739 } 740 getPrimaries(int variableTop, int pair)741 private static int getPrimaries(int variableTop, int pair) { 742 int ce = pair & 0xffff; 743 if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; } 744 if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; } 745 if(ce >= MIN_LONG) { return 0; } // variable 746 return pair; // special mini CE 747 } 748 getSecondariesFromOneShortCE(int ce)749 private static int getSecondariesFromOneShortCE(int ce) { 750 ce &= SECONDARY_MASK; 751 if(ce < MIN_SEC_HIGH) { 752 return ce + SEC_OFFSET; 753 } else { 754 return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET; 755 } 756 } 757 getSecondaries(int variableTop, int pair)758 private static int getSecondaries(int variableTop, int pair) { 759 if(pair <= 0xffff) { 760 // one mini CE 761 if(pair >= MIN_SHORT) { 762 pair = getSecondariesFromOneShortCE(pair); 763 } else if(pair > variableTop) { 764 pair = COMMON_SEC_PLUS_OFFSET; 765 } else if(pair >= MIN_LONG) { 766 pair = 0; // variable 767 } 768 // else special mini CE 769 } else { 770 int ce = pair & 0xffff; 771 if(ce >= MIN_SHORT) { 772 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 773 } else if(ce > variableTop) { 774 pair = TWO_COMMON_SEC_PLUS_OFFSET; 775 } else { 776 assert(ce >= MIN_LONG); 777 pair = 0; // variable 778 } 779 } 780 return pair; 781 } 782 getCases(int variableTop, boolean strengthIsPrimary, int pair)783 private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) { 784 // Primary+caseLevel: Ignore case level weights of primary ignorables. 785 // Otherwise: Ignore case level weights of secondary ignorables. 786 // For details see the comments in the CollationCompare class. 787 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 788 if(pair <= 0xffff) { 789 // one mini CE 790 if(pair >= MIN_SHORT) { 791 // A high secondary weight means we really have two CEs, 792 // a primary CE and a secondary CE. 793 int ce = pair; 794 pair &= CASE_MASK; // explicit weight of primary CE 795 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 796 pair |= LOWER_CASE << 16; // implied weight of secondary CE 797 } 798 } else if(pair > variableTop) { 799 pair = LOWER_CASE; 800 } else if(pair >= MIN_LONG) { 801 pair = 0; // variable 802 } 803 // else special mini CE 804 } else { 805 // two mini CEs, same primary groups, neither expands like above 806 int ce = pair & 0xffff; 807 if(ce >= MIN_SHORT) { 808 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 809 pair &= CASE_MASK; 810 } else { 811 pair &= TWO_CASES_MASK; 812 } 813 } else if(ce > variableTop) { 814 pair = TWO_LOWER_CASES; 815 } else { 816 assert(ce >= MIN_LONG); 817 pair = 0; // variable 818 } 819 } 820 return pair; 821 } 822 getTertiaries(int variableTop, boolean withCaseBits, int pair)823 private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) { 824 if(pair <= 0xffff) { 825 // one mini CE 826 if(pair >= MIN_SHORT) { 827 // A high secondary weight means we really have two CEs, 828 // a primary CE and a secondary CE. 829 int ce = pair; 830 if(withCaseBits) { 831 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 832 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 833 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 834 } 835 } else { 836 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 837 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 838 pair |= COMMON_TER_PLUS_OFFSET << 16; 839 } 840 } 841 } else if(pair > variableTop) { 842 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 843 if(withCaseBits) { 844 pair |= LOWER_CASE; 845 } 846 } else if(pair >= MIN_LONG) { 847 pair = 0; // variable 848 } 849 // else special mini CE 850 } else { 851 // two mini CEs, same primary groups, neither expands like above 852 int ce = pair & 0xffff; 853 if(ce >= MIN_SHORT) { 854 if(withCaseBits) { 855 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 856 } else { 857 pair &= TWO_TERTIARIES_MASK; 858 } 859 pair += TWO_TER_OFFSETS; 860 } else if(ce > variableTop) { 861 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 862 if(withCaseBits) { 863 pair |= TWO_LOWER_CASES; 864 } 865 } else { 866 assert(ce >= MIN_LONG); 867 pair = 0; // variable 868 } 869 } 870 return pair; 871 } 872 getQuaternaries(int variableTop, int pair)873 private static int getQuaternaries(int variableTop, int pair) { 874 // Return the primary weight of a variable CE, 875 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 876 if(pair <= 0xffff) { 877 // one mini CE 878 if(pair >= MIN_SHORT) { 879 // A high secondary weight means we really have two CEs, 880 // a primary CE and a secondary CE. 881 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 882 pair = TWO_SHORT_PRIMARIES_MASK; 883 } else { 884 pair = SHORT_PRIMARY_MASK; 885 } 886 } else if(pair > variableTop) { 887 pair = SHORT_PRIMARY_MASK; 888 } else if(pair >= MIN_LONG) { 889 pair &= LONG_PRIMARY_MASK; // variable 890 } 891 // else special mini CE 892 } else { 893 // two mini CEs, same primary groups, neither expands like above 894 int ce = pair & 0xffff; 895 if(ce > variableTop) { 896 pair = TWO_SHORT_PRIMARIES_MASK; 897 } else { 898 assert(ce >= MIN_LONG); 899 pair &= TWO_LONG_PRIMARIES_MASK; // variable 900 } 901 } 902 return pair; 903 } 904 CollationFastLatin()905 private CollationFastLatin() {} // no constructor 906 } 907