1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp 10 * 11 * C++ version created on: 2013aug09 12 * created by: Markus W. Scherer 13 */ 14 15 package ohos.global.icu.impl.coll; 16 17 import ohos.global.icu.lang.UScript; 18 import ohos.global.icu.text.Collator; 19 import ohos.global.icu.util.CharsTrie; 20 21 final class CollationFastLatinBuilder { 22 // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 23 24 /** 25 * Compare two signed long values as if they were unsigned. 26 */ compareInt64AsUnsigned(long a, long b)27 private static final int compareInt64AsUnsigned(long a, long b) { 28 a += 0x8000000000000000L; 29 b += 0x8000000000000000L; 30 if(a < b) { 31 return -1; 32 } else if(a > b) { 33 return 1; 34 } else { 35 return 0; 36 } 37 } 38 39 /** 40 * Like Java Collections.binarySearch(List, String, Comparator). 41 * 42 * @return the index>=0 where the item was found, 43 * or the index<0 for inserting the string at ~index in sorted order 44 */ binarySearch(long[] list, int limit, long ce)45 private static final int binarySearch(long[] list, int limit, long ce) { 46 if (limit == 0) { return ~0; } 47 int start = 0; 48 for (;;) { 49 int i = (int)(((long)start + (long)limit) / 2); 50 int cmp = compareInt64AsUnsigned(ce, list[i]); 51 if (cmp == 0) { 52 return i; 53 } else if (cmp < 0) { 54 if (i == start) { 55 return ~start; // insert ce before i 56 } 57 limit = i; 58 } else { 59 if (i == start) { 60 return ~(start + 1); // insert ce after i 61 } 62 start = i; 63 } 64 } 65 } 66 CollationFastLatinBuilder()67 CollationFastLatinBuilder() { 68 ce0 = 0; 69 ce1 = 0; 70 contractionCEs = new UVector64(); 71 uniqueCEs = new UVector64(); 72 miniCEs = null; 73 firstDigitPrimary = 0; 74 firstLatinPrimary = 0; 75 lastLatinPrimary = 0; 76 firstShortPrimary = 0; 77 shortPrimaryOverflow = false; 78 headerLength = 0; 79 } 80 forData(CollationData data)81 boolean forData(CollationData data) { 82 if(result.length() != 0) { // This builder is not reusable. 83 throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder"); 84 } 85 if(!loadGroups(data)) { return false; } 86 87 // Fast handling of digits. 88 firstShortPrimary = firstDigitPrimary; 89 getCEs(data); 90 encodeUniqueCEs(); 91 if(shortPrimaryOverflow) { 92 // Give digits long mini primaries, 93 // so that there are more short primaries for letters. 94 firstShortPrimary = firstLatinPrimary; 95 resetCEs(); 96 getCEs(data); 97 encodeUniqueCEs(); 98 } 99 // Note: If we still have a short-primary overflow but not a long-primary overflow, 100 // then we could calculate how many more long primaries would fit, 101 // and set the firstShortPrimary to that many after the current firstShortPrimary, 102 // and try again. 103 // However, this might only benefit the en_US_POSIX tailoring, 104 // and it is simpler to suppress building fast Latin data for it in genrb, 105 // or by returning false here if shortPrimaryOverflow. 106 107 boolean ok = !shortPrimaryOverflow; 108 if(ok) { 109 encodeCharCEs(); 110 encodeContractions(); 111 } 112 contractionCEs.removeAllElements(); // might reduce heap memory usage 113 uniqueCEs.removeAllElements(); 114 return ok; 115 } 116 117 // C++ returns one combined array with the contents of the result buffer. 118 // Java returns two arrays (header & table) because we cannot use pointer arithmetic, 119 // and we do not want to index into the table with an offset. getHeader()120 char[] getHeader() { 121 char[] resultArray = new char[headerLength]; 122 result.getChars(0, headerLength, resultArray, 0); 123 return resultArray; 124 } 125 getTable()126 char[] getTable() { 127 char[] resultArray = new char[result.length() - headerLength]; 128 result.getChars(headerLength, result.length(), resultArray, 0); 129 return resultArray; 130 } 131 loadGroups(CollationData data)132 private boolean loadGroups(CollationData data) { 133 headerLength = 1 + NUM_SPECIAL_GROUPS; 134 int r0 = (CollationFastLatin.VERSION << 8) | headerLength; 135 result.append((char)r0); 136 // The first few reordering groups should be special groups 137 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. 138 for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) { 139 lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i); 140 if(lastSpecialPrimaries[i] == 0) { 141 // missing data 142 return false; 143 } 144 result.append(0); // reserve a slot for this group 145 } 146 147 firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT); 148 firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN); 149 lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN); 150 if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { 151 // missing data 152 return false; 153 } 154 return true; 155 } 156 inSameGroup(long p, long q)157 private boolean inSameGroup(long p, long q) { 158 // Both or neither need to be encoded as short primaries, 159 // so that we can test only one and use the same bit mask. 160 if(p >= firstShortPrimary) { 161 return q >= firstShortPrimary; 162 } else if(q >= firstShortPrimary) { 163 return false; 164 } 165 // Both or neither must be potentially-variable, 166 // so that we can test only one and determine if both are variable. 167 long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1]; 168 if(p > lastVariablePrimary) { 169 return q > lastVariablePrimary; 170 } else if(q > lastVariablePrimary) { 171 return false; 172 } 173 // Both will be encoded with long mini primaries. 174 // They must be in the same special reordering group, 175 // so that we can test only one and determine if both are variable. 176 assert(p != 0 && q != 0); 177 for(int i = 0;; ++i) { // will terminate 178 long lastPrimary = lastSpecialPrimaries[i]; 179 if(p <= lastPrimary) { 180 return q <= lastPrimary; 181 } else if(q <= lastPrimary) { 182 return false; 183 } 184 } 185 } 186 resetCEs()187 private void resetCEs() { 188 contractionCEs.removeAllElements(); 189 uniqueCEs.removeAllElements(); 190 shortPrimaryOverflow = false; 191 result.setLength(headerLength); 192 } 193 getCEs(CollationData data)194 private void getCEs(CollationData data) { 195 int i = 0; 196 for(char c = 0;; ++i, ++c) { 197 if(c == CollationFastLatin.LATIN_LIMIT) { 198 c = CollationFastLatin.PUNCT_START; 199 } else if(c == CollationFastLatin.PUNCT_LIMIT) { 200 break; 201 } 202 CollationData d; 203 int ce32 = data.getCE32(c); 204 if(ce32 == Collation.FALLBACK_CE32) { 205 d = data.base; 206 ce32 = d.getCE32(c); 207 } else { 208 d = data; 209 } 210 if(getCEsFromCE32(d, c, ce32)) { 211 charCEs[i][0] = ce0; 212 charCEs[i][1] = ce1; 213 addUniqueCE(ce0); 214 addUniqueCE(ce1); 215 } else { 216 // bail out for c 217 charCEs[i][0] = ce0 = Collation.NO_CE; 218 charCEs[i][1] = ce1 = 0; 219 } 220 if(c == 0 && !isContractionCharCE(ce0)) { 221 // Always map U+0000 to a contraction. 222 // Write a contraction list with only a default value if there is no real contraction. 223 assert(contractionCEs.isEmpty()); 224 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 225 charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG; 226 charCEs[0][1] = 0; 227 } 228 } 229 // Terminate the last contraction list. 230 contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK); 231 } 232 getCEsFromCE32(CollationData data, int c, int ce32)233 private boolean getCEsFromCE32(CollationData data, int c, int ce32) { 234 ce32 = data.getFinalCE32(ce32); 235 ce1 = 0; 236 if(Collation.isSimpleOrLongCE32(ce32)) { 237 ce0 = Collation.ceFromCE32(ce32); 238 } else { 239 switch(Collation.tagFromCE32(ce32)) { 240 case Collation.LATIN_EXPANSION_TAG: 241 ce0 = Collation.latinCE0FromCE32(ce32); 242 ce1 = Collation.latinCE1FromCE32(ce32); 243 break; 244 case Collation.EXPANSION32_TAG: { 245 int index = Collation.indexFromCE32(ce32); 246 int length = Collation.lengthFromCE32(ce32); 247 if(length <= 2) { 248 ce0 = Collation.ceFromCE32(data.ce32s[index]); 249 if(length == 2) { 250 ce1 = Collation.ceFromCE32(data.ce32s[index + 1]); 251 } 252 break; 253 } else { 254 return false; 255 } 256 } 257 case Collation.EXPANSION_TAG: { 258 int index = Collation.indexFromCE32(ce32); 259 int length = Collation.lengthFromCE32(ce32); 260 if(length <= 2) { 261 ce0 = data.ces[index]; 262 if(length == 2) { 263 ce1 = data.ces[index + 1]; 264 } 265 break; 266 } else { 267 return false; 268 } 269 } 270 // Note: We could support PREFIX_TAG (assert c>=0) 271 // by recursing on its default CE32 and checking that none of the prefixes starts 272 // with a fast Latin character. 273 // However, currently (2013) there are only the L-before-middle-dot 274 // prefix mappings in the Latin range, and those would be rejected anyway. 275 case Collation.CONTRACTION_TAG: 276 assert(c >= 0); 277 return getCEsFromContractionCE32(data, ce32); 278 case Collation.OFFSET_TAG: 279 assert(c >= 0); 280 ce0 = data.getCEFromOffsetCE32(c, ce32); 281 break; 282 default: 283 return false; 284 } 285 } 286 // A mapping can be completely ignorable. 287 if(ce0 == 0) { return ce1 == 0; } 288 // We do not support an ignorable ce0 unless it is completely ignorable. 289 long p0 = ce0 >>> 32; 290 if(p0 == 0) { return false; } 291 // We only support primaries up to the Latin script. 292 if(p0 > lastLatinPrimary) { return false; } 293 // We support non-common secondary and case weights only together with short primaries. 294 int lower32_0 = (int)ce0; 295 if(p0 < firstShortPrimary) { 296 int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK; 297 if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; } 298 } 299 // No below-common tertiary weights. 300 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 301 if(ce1 != 0) { 302 // Both primaries must be in the same group, 303 // or both must get short mini primaries, 304 // or a short-primary CE is followed by a secondary CE. 305 // This is so that we can test the first primary and use the same mask for both, 306 // and determine for both whether they are variable. 307 long p1 = ce1 >>> 32; 308 if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; } 309 int lower32_1 = (int)ce1; 310 // No tertiary CEs. 311 if((lower32_1 >>> 16) == 0) { return false; } 312 // We support non-common secondary and case weights 313 // only for secondary CEs or together with short primaries. 314 if(p1 != 0 && p1 < firstShortPrimary) { 315 int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK; 316 if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; } 317 } 318 // No below-common tertiary weights. 319 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 320 } 321 // No quaternary weights. 322 if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; } 323 return true; 324 } 325 getCEsFromContractionCE32(CollationData data, int ce32)326 private boolean getCEsFromContractionCE32(CollationData data, int ce32) { 327 int trieIndex = Collation.indexFromCE32(ce32); 328 ce32 = data.getCE32FromContexts(trieIndex); // Default if no suffix match. 329 // Since the original ce32 is not a prefix mapping, 330 // the default ce32 must not be another contraction. 331 assert(!Collation.isContractionCE32(ce32)); 332 int contractionIndex = contractionCEs.size(); 333 if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 334 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 335 } else { 336 // Bail out for c-without-contraction. 337 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0); 338 } 339 // Handle an encodable contraction unless the next contraction is too long 340 // and starts with the same character. 341 int prevX = -1; 342 boolean addContraction = false; 343 CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0); 344 while(suffixes.hasNext()) { 345 CharsTrie.Entry entry = suffixes.next(); 346 CharSequence suffix = entry.chars; 347 int x = CollationFastLatin.getCharIndex(suffix.charAt(0)); 348 if(x < 0) { continue; } // ignore anything but fast Latin text 349 if(x == prevX) { 350 if(addContraction) { 351 // Bail out for all contractions starting with this character. 352 addContractionEntry(x, Collation.NO_CE, 0); 353 addContraction = false; 354 } 355 continue; 356 } 357 if(addContraction) { 358 addContractionEntry(prevX, ce0, ce1); 359 } 360 ce32 = entry.value; 361 if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 362 addContraction = true; 363 } else { 364 addContractionEntry(x, Collation.NO_CE, 0); 365 addContraction = false; 366 } 367 prevX = x; 368 } 369 if(addContraction) { 370 addContractionEntry(prevX, ce0, ce1); 371 } 372 // Note: There might not be any fast Latin contractions, but 373 // we need to enter contraction handling anyway so that we can bail out 374 // when there is a non-fast-Latin character following. 375 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the 376 // following umlaut and bail out, rather than return the difference of Y vs. u. 377 ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex; 378 ce1 = 0; 379 return true; 380 } 381 addContractionEntry(int x, long cce0, long cce1)382 private void addContractionEntry(int x, long cce0, long cce1) { 383 contractionCEs.addElement(x); 384 contractionCEs.addElement(cce0); 385 contractionCEs.addElement(cce1); 386 addUniqueCE(cce0); 387 addUniqueCE(cce1); 388 } 389 addUniqueCE(long ce)390 private void addUniqueCE(long ce) { 391 if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; } 392 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 393 int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 394 if(i < 0) { 395 uniqueCEs.insertElementAt(ce, ~i); 396 } 397 } 398 getMiniCE(long ce)399 private int getMiniCE(long ce) { 400 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 401 int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 402 assert(index >= 0); 403 return miniCEs[index]; 404 } 405 encodeUniqueCEs()406 private void encodeUniqueCEs() { 407 miniCEs = new char[uniqueCEs.size()]; 408 int group = 0; 409 long lastGroupPrimary = lastSpecialPrimaries[group]; 410 // The lowest unique CE must be at least a secondary CE. 411 assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0); 412 long prevPrimary = 0; 413 int prevSecondary = 0; 414 int pri = 0; 415 int sec = 0; 416 int ter = CollationFastLatin.COMMON_TER; 417 for(int i = 0; i < uniqueCEs.size(); ++i) { 418 long ce = uniqueCEs.elementAti(i); 419 // Note: At least one of the p/s/t weights changes from one unique CE to the next. 420 // (uniqueCEs does not store case bits.) 421 long p = ce >>> 32; 422 if(p != prevPrimary) { 423 while(p > lastGroupPrimary) { 424 assert(pri <= CollationFastLatin.MAX_LONG); 425 // Set the group's header entry to the 426 // last "long primary" in or before the group. 427 result.setCharAt(1 + group, (char)pri); 428 if(++group < NUM_SPECIAL_GROUPS) { 429 lastGroupPrimary = lastSpecialPrimaries[group]; 430 } else { 431 lastGroupPrimary = 0xffffffffL; 432 break; 433 } 434 } 435 if(p < firstShortPrimary) { 436 if(pri == 0) { 437 pri = CollationFastLatin.MIN_LONG; 438 } else if(pri < CollationFastLatin.MAX_LONG) { 439 pri += CollationFastLatin.LONG_INC; 440 } else { 441 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 442 printf("long-primary overflow for %08x\n", p); 443 #endif */ 444 miniCEs[i] = CollationFastLatin.BAIL_OUT; 445 continue; 446 } 447 } else { 448 if(pri < CollationFastLatin.MIN_SHORT) { 449 pri = CollationFastLatin.MIN_SHORT; 450 } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) { 451 // Reserve the highest primary weight for U+FFFF. 452 pri += CollationFastLatin.SHORT_INC; 453 } else { 454 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 455 printf("short-primary overflow for %08x\n", p); 456 #endif */ 457 shortPrimaryOverflow = true; 458 miniCEs[i] = CollationFastLatin.BAIL_OUT; 459 continue; 460 } 461 } 462 prevPrimary = p; 463 prevSecondary = Collation.COMMON_WEIGHT16; 464 sec = CollationFastLatin.COMMON_SEC; 465 ter = CollationFastLatin.COMMON_TER; 466 } 467 int lower32 = (int)ce; 468 int s = lower32 >>> 16; 469 if(s != prevSecondary) { 470 if(pri == 0) { 471 if(sec == 0) { 472 sec = CollationFastLatin.MIN_SEC_HIGH; 473 } else if(sec < CollationFastLatin.MAX_SEC_HIGH) { 474 sec += CollationFastLatin.SEC_INC; 475 } else { 476 miniCEs[i] = CollationFastLatin.BAIL_OUT; 477 continue; 478 } 479 prevSecondary = s; 480 ter = CollationFastLatin.COMMON_TER; 481 } else if(s < Collation.COMMON_WEIGHT16) { 482 if(sec == CollationFastLatin.COMMON_SEC) { 483 sec = CollationFastLatin.MIN_SEC_BEFORE; 484 } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) { 485 sec += CollationFastLatin.SEC_INC; 486 } else { 487 miniCEs[i] = CollationFastLatin.BAIL_OUT; 488 continue; 489 } 490 } else if(s == Collation.COMMON_WEIGHT16) { 491 sec = CollationFastLatin.COMMON_SEC; 492 } else { 493 if(sec < CollationFastLatin.MIN_SEC_AFTER) { 494 sec = CollationFastLatin.MIN_SEC_AFTER; 495 } else if(sec < CollationFastLatin.MAX_SEC_AFTER) { 496 sec += CollationFastLatin.SEC_INC; 497 } else { 498 miniCEs[i] = CollationFastLatin.BAIL_OUT; 499 continue; 500 } 501 } 502 prevSecondary = s; 503 ter = CollationFastLatin.COMMON_TER; 504 } 505 assert((lower32 & Collation.CASE_MASK) == 0); // blanked out in uniqueCEs 506 int t = lower32 & Collation.ONLY_TERTIARY_MASK; 507 if(t > Collation.COMMON_WEIGHT16) { 508 if(ter < CollationFastLatin.MAX_TER_AFTER) { 509 ++ter; 510 } else { 511 miniCEs[i] = CollationFastLatin.BAIL_OUT; 512 continue; 513 } 514 } 515 if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) { 516 assert(sec == CollationFastLatin.COMMON_SEC); 517 miniCEs[i] = (char)(pri | ter); 518 } else { 519 miniCEs[i] = (char)(pri | sec | ter); 520 } 521 } 522 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 523 printf("last mini primary: %04x\n", pri); 524 #endif */ 525 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2 526 for(int i = 0; i < uniqueCEs.size(); ++i) { 527 long ce = uniqueCEs.elementAti(i); 528 printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]); 529 } 530 #endif */ 531 } 532 encodeCharCEs()533 private void encodeCharCEs() { 534 int miniCEsStart = result.length(); 535 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 536 result.append(0); // initialize to completely ignorable 537 } 538 int indexBase = result.length(); 539 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 540 long ce = charCEs[i][0]; 541 if(isContractionCharCE(ce)) { continue; } // defer contraction 542 int miniCE = encodeTwoCEs(ce, charCEs[i][1]); 543 if((miniCE >>> 16) > 0) { // if ((unsigned)miniCE > 0xffff) 544 // Note: There is a chance that this new expansion is the same as a previous one, 545 // and if so, then we could reuse the other expansion. 546 // However, that seems unlikely. 547 int expansionIndex = result.length() - indexBase; 548 if(expansionIndex > CollationFastLatin.INDEX_MASK) { 549 miniCE = CollationFastLatin.BAIL_OUT; 550 } else { 551 result.append((char)(miniCE >> 16)).append((char)miniCE); 552 miniCE = CollationFastLatin.EXPANSION | expansionIndex; 553 } 554 } 555 result.setCharAt(miniCEsStart + i, (char)miniCE); 556 } 557 } 558 encodeContractions()559 private void encodeContractions() { 560 // We encode all contraction lists so that the first word of a list 561 // terminates the previous list, and we only need one additional terminator at the end. 562 int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS; 563 int firstContractionIndex = result.length(); 564 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 565 long ce = charCEs[i][0]; 566 if(!isContractionCharCE(ce)) { continue; } 567 int contractionIndex = result.length() - indexBase; 568 if(contractionIndex > CollationFastLatin.INDEX_MASK) { 569 result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT); 570 continue; 571 } 572 boolean firstTriple = true; 573 for(int index = (int)ce & 0x7fffffff;; index += 3) { 574 long x = contractionCEs.elementAti(index); 575 if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; } 576 long cce0 = contractionCEs.elementAti(index + 1); 577 long cce1 = contractionCEs.elementAti(index + 2); 578 int miniCE = encodeTwoCEs(cce0, cce1); 579 if(miniCE == CollationFastLatin.BAIL_OUT) { 580 result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 581 } else if((miniCE >>> 16) == 0) { // if ((unsigned)miniCE <= 0xffff) 582 result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 583 result.append((char)miniCE); 584 } else { 585 result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 586 result.append((char)(miniCE >> 16)).append((char)miniCE); 587 } 588 firstTriple = false; 589 } 590 // Note: There is a chance that this new contraction list is the same as a previous one, 591 // and if so, then we could truncate the result and reuse the other list. 592 // However, that seems unlikely. 593 result.setCharAt(headerLength + i, 594 (char)(CollationFastLatin.CONTRACTION | contractionIndex)); 595 } 596 if(result.length() > firstContractionIndex) { 597 // Terminate the last contraction list. 598 result.append((char)CollationFastLatin.CONTR_CHAR_MASK); 599 } 600 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 601 printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2); 602 puts(" header & below-digit groups map"); 603 int i = 0; 604 for(; i < headerLength; ++i) { 605 printf(" %04x", result[i]); 606 } 607 printf("\n char mini CEs"); 608 assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0); 609 for(; i < indexBase; i += 16) { 610 int c = i - headerLength; 611 if(c >= CollationFastLatin.LATIN_LIMIT) { 612 c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT; 613 } 614 printf("\n %04x:", c); 615 for(int j = 0; j < 16; ++j) { 616 printf(" %04x", result[i + j]); 617 } 618 } 619 printf("\n expansions & contractions"); 620 for(; i < result.length(); ++i) { 621 if((i - indexBase) % 16 == 0) { puts(""); } 622 printf(" %04x", result[i]); 623 } 624 puts(""); 625 #endif */ 626 } 627 encodeTwoCEs(long first, long second)628 private int encodeTwoCEs(long first, long second) { 629 if(first == 0) { 630 return 0; // completely ignorable 631 } 632 if(first == Collation.NO_CE) { 633 return CollationFastLatin.BAIL_OUT; 634 } 635 assert((first >>> 32) != Collation.NO_CE_PRIMARY); 636 637 int miniCE = getMiniCE(first); 638 if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; } 639 if(miniCE >= CollationFastLatin.MIN_SHORT) { 640 // Extract & copy the case bits. 641 // Shift them from normal CE bits 15..14 to mini CE bits 4..3. 642 int c = (((int)first & Collation.CASE_MASK) >> (14 - 3)); 643 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1. 644 c += CollationFastLatin.LOWER_CASE; 645 miniCE |= c; 646 } 647 if(second == 0) { return miniCE; } 648 649 int miniCE1 = getMiniCE(second); 650 if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; } 651 652 int case1 = (int)second & Collation.CASE_MASK; 653 if(miniCE >= CollationFastLatin.MIN_SHORT && 654 (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) { 655 // Try to combine the two mini CEs into one. 656 int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK; 657 int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK; 658 if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 && 659 ter1 == CollationFastLatin.COMMON_TER) { 660 // sec1>=sec_high implies pri1==0. 661 return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1; 662 } 663 } 664 665 if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) { 666 // Secondary CE, or a CE with a short primary, copy the case bits. 667 case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE; 668 miniCE1 |= case1; 669 } 670 return (miniCE << 16) | miniCE1; 671 } 672 isContractionCharCE(long ce)673 private static boolean isContractionCharCE(long ce) { 674 return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE; 675 } 676 677 // space, punct, symbol, currency (not digit) 678 private static final int NUM_SPECIAL_GROUPS = 679 Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1; 680 681 private static final long CONTRACTION_FLAG = 0x80000000L; 682 683 // temporary "buffer" 684 private long ce0, ce1; 685 686 private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2]; 687 688 private UVector64 contractionCEs; 689 private UVector64 uniqueCEs; 690 691 /** One 16-bit mini CE per unique CE. */ 692 private char[] miniCEs; 693 694 // These are constant for a given root collator. 695 long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS]; 696 private long firstDigitPrimary; 697 private long firstLatinPrimary; 698 private long lastLatinPrimary; 699 // This determines the first normal primary weight which is mapped to 700 // a short mini primary. It must be >=firstDigitPrimary. 701 private long firstShortPrimary; 702 703 private boolean shortPrimaryOverflow; 704 705 private StringBuilder result = new StringBuilder(); 706 private int headerLength; 707 } 708