1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp 9 * 10 * C++ version created on: 2013aug09 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.lang.UScript; 17 import com.ibm.icu.text.Collator; 18 import com.ibm.icu.util.CharsTrie; 19 20 final class CollationFastLatinBuilder { 21 // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 22 23 /** 24 * Compare two signed long values as if they were unsigned. 25 */ compareInt64AsUnsigned(long a, long b)26 private static final int compareInt64AsUnsigned(long a, long b) { 27 a += 0x8000000000000000L; 28 b += 0x8000000000000000L; 29 if(a < b) { 30 return -1; 31 } else if(a > b) { 32 return 1; 33 } else { 34 return 0; 35 } 36 } 37 38 /** 39 * Like Java Collections.binarySearch(List, String, Comparator). 40 * 41 * @return the index>=0 where the item was found, 42 * or the index<0 for inserting the string at ~index in sorted order 43 */ binarySearch(long[] list, int limit, long ce)44 private static final int binarySearch(long[] list, int limit, long ce) { 45 if (limit == 0) { return ~0; } 46 int start = 0; 47 for (;;) { 48 int i = (int)(((long)start + (long)limit) / 2); 49 int cmp = compareInt64AsUnsigned(ce, list[i]); 50 if (cmp == 0) { 51 return i; 52 } else if (cmp < 0) { 53 if (i == start) { 54 return ~start; // insert ce before i 55 } 56 limit = i; 57 } else { 58 if (i == start) { 59 return ~(start + 1); // insert ce after i 60 } 61 start = i; 62 } 63 } 64 } 65 CollationFastLatinBuilder()66 CollationFastLatinBuilder() { 67 ce0 = 0; 68 ce1 = 0; 69 contractionCEs = new UVector64(); 70 uniqueCEs = new UVector64(); 71 miniCEs = null; 72 firstDigitPrimary = 0; 73 firstLatinPrimary = 0; 74 lastLatinPrimary = 0; 75 firstShortPrimary = 0; 76 shortPrimaryOverflow = false; 77 headerLength = 0; 78 } 79 forData(CollationData data)80 boolean forData(CollationData data) { 81 if(result.length() != 0) { // This builder is not reusable. 82 throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder"); 83 } 84 if(!loadGroups(data)) { return false; } 85 86 // Fast handling of digits. 87 firstShortPrimary = firstDigitPrimary; 88 getCEs(data); 89 encodeUniqueCEs(); 90 if(shortPrimaryOverflow) { 91 // Give digits long mini primaries, 92 // so that there are more short primaries for letters. 93 firstShortPrimary = firstLatinPrimary; 94 resetCEs(); 95 getCEs(data); 96 encodeUniqueCEs(); 97 } 98 // Note: If we still have a short-primary overflow but not a long-primary overflow, 99 // then we could calculate how many more long primaries would fit, 100 // and set the firstShortPrimary to that many after the current firstShortPrimary, 101 // and try again. 102 // However, this might only benefit the en_US_POSIX tailoring, 103 // and it is simpler to suppress building fast Latin data for it in genrb, 104 // or by returning false here if shortPrimaryOverflow. 105 106 boolean ok = !shortPrimaryOverflow; 107 if(ok) { 108 encodeCharCEs(); 109 encodeContractions(); 110 } 111 contractionCEs.removeAllElements(); // might reduce heap memory usage 112 uniqueCEs.removeAllElements(); 113 return ok; 114 } 115 116 // C++ returns one combined array with the contents of the result buffer. 117 // Java returns two arrays (header & table) because we cannot use pointer arithmetic, 118 // and we do not want to index into the table with an offset. getHeader()119 char[] getHeader() { 120 char[] resultArray = new char[headerLength]; 121 result.getChars(0, headerLength, resultArray, 0); 122 return resultArray; 123 } 124 getTable()125 char[] getTable() { 126 char[] resultArray = new char[result.length() - headerLength]; 127 result.getChars(headerLength, result.length(), resultArray, 0); 128 return resultArray; 129 } 130 loadGroups(CollationData data)131 private boolean loadGroups(CollationData data) { 132 headerLength = 1 + NUM_SPECIAL_GROUPS; 133 int r0 = (CollationFastLatin.VERSION << 8) | headerLength; 134 result.append((char)r0); 135 // The first few reordering groups should be special groups 136 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. 137 for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) { 138 lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i); 139 if(lastSpecialPrimaries[i] == 0) { 140 // missing data 141 return false; 142 } 143 result.append(0); // reserve a slot for this group 144 } 145 146 firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT); 147 firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN); 148 lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN); 149 if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { 150 // missing data 151 return false; 152 } 153 return true; 154 } 155 inSameGroup(long p, long q)156 private boolean inSameGroup(long p, long q) { 157 // Both or neither need to be encoded as short primaries, 158 // so that we can test only one and use the same bit mask. 159 if(p >= firstShortPrimary) { 160 return q >= firstShortPrimary; 161 } else if(q >= firstShortPrimary) { 162 return false; 163 } 164 // Both or neither must be potentially-variable, 165 // so that we can test only one and determine if both are variable. 166 long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1]; 167 if(p > lastVariablePrimary) { 168 return q > lastVariablePrimary; 169 } else if(q > lastVariablePrimary) { 170 return false; 171 } 172 // Both will be encoded with long mini primaries. 173 // They must be in the same special reordering group, 174 // so that we can test only one and determine if both are variable. 175 assert(p != 0 && q != 0); 176 for(int i = 0;; ++i) { // will terminate 177 long lastPrimary = lastSpecialPrimaries[i]; 178 if(p <= lastPrimary) { 179 return q <= lastPrimary; 180 } else if(q <= lastPrimary) { 181 return false; 182 } 183 } 184 } 185 resetCEs()186 private void resetCEs() { 187 contractionCEs.removeAllElements(); 188 uniqueCEs.removeAllElements(); 189 shortPrimaryOverflow = false; 190 result.setLength(headerLength); 191 } 192 getCEs(CollationData data)193 private void getCEs(CollationData data) { 194 int i = 0; 195 for(char c = 0;; ++i, ++c) { 196 if(c == CollationFastLatin.LATIN_LIMIT) { 197 c = CollationFastLatin.PUNCT_START; 198 } else if(c == CollationFastLatin.PUNCT_LIMIT) { 199 break; 200 } 201 CollationData d; 202 int ce32 = data.getCE32(c); 203 if(ce32 == Collation.FALLBACK_CE32) { 204 d = data.base; 205 ce32 = d.getCE32(c); 206 } else { 207 d = data; 208 } 209 if(getCEsFromCE32(d, c, ce32)) { 210 charCEs[i][0] = ce0; 211 charCEs[i][1] = ce1; 212 addUniqueCE(ce0); 213 addUniqueCE(ce1); 214 } else { 215 // bail out for c 216 charCEs[i][0] = ce0 = Collation.NO_CE; 217 charCEs[i][1] = ce1 = 0; 218 } 219 if(c == 0 && !isContractionCharCE(ce0)) { 220 // Always map U+0000 to a contraction. 221 // Write a contraction list with only a default value if there is no real contraction. 222 assert(contractionCEs.isEmpty()); 223 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 224 charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG; 225 charCEs[0][1] = 0; 226 } 227 } 228 // Terminate the last contraction list. 229 contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK); 230 } 231 getCEsFromCE32(CollationData data, int c, int ce32)232 private boolean getCEsFromCE32(CollationData data, int c, int ce32) { 233 ce32 = data.getFinalCE32(ce32); 234 ce1 = 0; 235 if(Collation.isSimpleOrLongCE32(ce32)) { 236 ce0 = Collation.ceFromCE32(ce32); 237 } else { 238 switch(Collation.tagFromCE32(ce32)) { 239 case Collation.LATIN_EXPANSION_TAG: 240 ce0 = Collation.latinCE0FromCE32(ce32); 241 ce1 = Collation.latinCE1FromCE32(ce32); 242 break; 243 case Collation.EXPANSION32_TAG: { 244 int index = Collation.indexFromCE32(ce32); 245 int length = Collation.lengthFromCE32(ce32); 246 if(length <= 2) { 247 ce0 = Collation.ceFromCE32(data.ce32s[index]); 248 if(length == 2) { 249 ce1 = Collation.ceFromCE32(data.ce32s[index + 1]); 250 } 251 break; 252 } else { 253 return false; 254 } 255 } 256 case Collation.EXPANSION_TAG: { 257 int index = Collation.indexFromCE32(ce32); 258 int length = Collation.lengthFromCE32(ce32); 259 if(length <= 2) { 260 ce0 = data.ces[index]; 261 if(length == 2) { 262 ce1 = data.ces[index + 1]; 263 } 264 break; 265 } else { 266 return false; 267 } 268 } 269 // Note: We could support PREFIX_TAG (assert c>=0) 270 // by recursing on its default CE32 and checking that none of the prefixes starts 271 // with a fast Latin character. 272 // However, currently (2013) there are only the L-before-middle-dot 273 // prefix mappings in the Latin range, and those would be rejected anyway. 274 case Collation.CONTRACTION_TAG: 275 assert(c >= 0); 276 return getCEsFromContractionCE32(data, ce32); 277 case Collation.OFFSET_TAG: 278 assert(c >= 0); 279 ce0 = data.getCEFromOffsetCE32(c, ce32); 280 break; 281 default: 282 return false; 283 } 284 } 285 // A mapping can be completely ignorable. 286 if(ce0 == 0) { return ce1 == 0; } 287 // We do not support an ignorable ce0 unless it is completely ignorable. 288 long p0 = ce0 >>> 32; 289 if(p0 == 0) { return false; } 290 // We only support primaries up to the Latin script. 291 if(p0 > lastLatinPrimary) { return false; } 292 // We support non-common secondary and case weights only together with short primaries. 293 int lower32_0 = (int)ce0; 294 if(p0 < firstShortPrimary) { 295 int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK; 296 if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; } 297 } 298 // No below-common tertiary weights. 299 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 300 if(ce1 != 0) { 301 // Both primaries must be in the same group, 302 // or both must get short mini primaries, 303 // or a short-primary CE is followed by a secondary CE. 304 // This is so that we can test the first primary and use the same mask for both, 305 // and determine for both whether they are variable. 306 long p1 = ce1 >>> 32; 307 if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; } 308 int lower32_1 = (int)ce1; 309 // No tertiary CEs. 310 if((lower32_1 >>> 16) == 0) { return false; } 311 // We support non-common secondary and case weights 312 // only for secondary CEs or together with short primaries. 313 if(p1 != 0 && p1 < firstShortPrimary) { 314 int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK; 315 if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; } 316 } 317 // No below-common tertiary weights. 318 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 319 } 320 // No quaternary weights. 321 if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; } 322 return true; 323 } 324 getCEsFromContractionCE32(CollationData data, int ce32)325 private boolean getCEsFromContractionCE32(CollationData data, int ce32) { 326 int trieIndex = Collation.indexFromCE32(ce32); 327 ce32 = data.getCE32FromContexts(trieIndex); // Default if no suffix match. 328 // Since the original ce32 is not a prefix mapping, 329 // the default ce32 must not be another contraction. 330 assert(!Collation.isContractionCE32(ce32)); 331 int contractionIndex = contractionCEs.size(); 332 if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 333 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 334 } else { 335 // Bail out for c-without-contraction. 336 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0); 337 } 338 // Handle an encodable contraction unless the next contraction is too long 339 // and starts with the same character. 340 int prevX = -1; 341 boolean addContraction = false; 342 CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0); 343 while(suffixes.hasNext()) { 344 CharsTrie.Entry entry = suffixes.next(); 345 CharSequence suffix = entry.chars; 346 int x = CollationFastLatin.getCharIndex(suffix.charAt(0)); 347 if(x < 0) { continue; } // ignore anything but fast Latin text 348 if(x == prevX) { 349 if(addContraction) { 350 // Bail out for all contractions starting with this character. 351 addContractionEntry(x, Collation.NO_CE, 0); 352 addContraction = false; 353 } 354 continue; 355 } 356 if(addContraction) { 357 addContractionEntry(prevX, ce0, ce1); 358 } 359 ce32 = entry.value; 360 if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 361 addContraction = true; 362 } else { 363 addContractionEntry(x, Collation.NO_CE, 0); 364 addContraction = false; 365 } 366 prevX = x; 367 } 368 if(addContraction) { 369 addContractionEntry(prevX, ce0, ce1); 370 } 371 // Note: There might not be any fast Latin contractions, but 372 // we need to enter contraction handling anyway so that we can bail out 373 // when there is a non-fast-Latin character following. 374 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the 375 // following umlaut and bail out, rather than return the difference of Y vs. u. 376 ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex; 377 ce1 = 0; 378 return true; 379 } 380 addContractionEntry(int x, long cce0, long cce1)381 private void addContractionEntry(int x, long cce0, long cce1) { 382 contractionCEs.addElement(x); 383 contractionCEs.addElement(cce0); 384 contractionCEs.addElement(cce1); 385 addUniqueCE(cce0); 386 addUniqueCE(cce1); 387 } 388 addUniqueCE(long ce)389 private void addUniqueCE(long ce) { 390 if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; } 391 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 392 int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 393 if(i < 0) { 394 uniqueCEs.insertElementAt(ce, ~i); 395 } 396 } 397 getMiniCE(long ce)398 private int getMiniCE(long ce) { 399 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 400 int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 401 assert(index >= 0); 402 return miniCEs[index]; 403 } 404 encodeUniqueCEs()405 private void encodeUniqueCEs() { 406 miniCEs = new char[uniqueCEs.size()]; 407 int group = 0; 408 long lastGroupPrimary = lastSpecialPrimaries[group]; 409 // The lowest unique CE must be at least a secondary CE. 410 assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0); 411 long prevPrimary = 0; 412 int prevSecondary = 0; 413 int pri = 0; 414 int sec = 0; 415 int ter = CollationFastLatin.COMMON_TER; 416 for(int i = 0; i < uniqueCEs.size(); ++i) { 417 long ce = uniqueCEs.elementAti(i); 418 // Note: At least one of the p/s/t weights changes from one unique CE to the next. 419 // (uniqueCEs does not store case bits.) 420 long p = ce >>> 32; 421 if(p != prevPrimary) { 422 while(p > lastGroupPrimary) { 423 assert(pri <= CollationFastLatin.MAX_LONG); 424 // Set the group's header entry to the 425 // last "long primary" in or before the group. 426 result.setCharAt(1 + group, (char)pri); 427 if(++group < NUM_SPECIAL_GROUPS) { 428 lastGroupPrimary = lastSpecialPrimaries[group]; 429 } else { 430 lastGroupPrimary = 0xffffffffL; 431 break; 432 } 433 } 434 if(p < firstShortPrimary) { 435 if(pri == 0) { 436 pri = CollationFastLatin.MIN_LONG; 437 } else if(pri < CollationFastLatin.MAX_LONG) { 438 pri += CollationFastLatin.LONG_INC; 439 } else { 440 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 441 printf("long-primary overflow for %08x\n", p); 442 #endif */ 443 miniCEs[i] = CollationFastLatin.BAIL_OUT; 444 continue; 445 } 446 } else { 447 if(pri < CollationFastLatin.MIN_SHORT) { 448 pri = CollationFastLatin.MIN_SHORT; 449 } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) { 450 // Reserve the highest primary weight for U+FFFF. 451 pri += CollationFastLatin.SHORT_INC; 452 } else { 453 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 454 printf("short-primary overflow for %08x\n", p); 455 #endif */ 456 shortPrimaryOverflow = true; 457 miniCEs[i] = CollationFastLatin.BAIL_OUT; 458 continue; 459 } 460 } 461 prevPrimary = p; 462 prevSecondary = Collation.COMMON_WEIGHT16; 463 sec = CollationFastLatin.COMMON_SEC; 464 ter = CollationFastLatin.COMMON_TER; 465 } 466 int lower32 = (int)ce; 467 int s = lower32 >>> 16; 468 if(s != prevSecondary) { 469 if(pri == 0) { 470 if(sec == 0) { 471 sec = CollationFastLatin.MIN_SEC_HIGH; 472 } else if(sec < CollationFastLatin.MAX_SEC_HIGH) { 473 sec += CollationFastLatin.SEC_INC; 474 } else { 475 miniCEs[i] = CollationFastLatin.BAIL_OUT; 476 continue; 477 } 478 prevSecondary = s; 479 ter = CollationFastLatin.COMMON_TER; 480 } else if(s < Collation.COMMON_WEIGHT16) { 481 if(sec == CollationFastLatin.COMMON_SEC) { 482 sec = CollationFastLatin.MIN_SEC_BEFORE; 483 } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) { 484 sec += CollationFastLatin.SEC_INC; 485 } else { 486 miniCEs[i] = CollationFastLatin.BAIL_OUT; 487 continue; 488 } 489 } else if(s == Collation.COMMON_WEIGHT16) { 490 sec = CollationFastLatin.COMMON_SEC; 491 } else { 492 if(sec < CollationFastLatin.MIN_SEC_AFTER) { 493 sec = CollationFastLatin.MIN_SEC_AFTER; 494 } else if(sec < CollationFastLatin.MAX_SEC_AFTER) { 495 sec += CollationFastLatin.SEC_INC; 496 } else { 497 miniCEs[i] = CollationFastLatin.BAIL_OUT; 498 continue; 499 } 500 } 501 prevSecondary = s; 502 ter = CollationFastLatin.COMMON_TER; 503 } 504 assert((lower32 & Collation.CASE_MASK) == 0); // blanked out in uniqueCEs 505 int t = lower32 & Collation.ONLY_TERTIARY_MASK; 506 if(t > Collation.COMMON_WEIGHT16) { 507 if(ter < CollationFastLatin.MAX_TER_AFTER) { 508 ++ter; 509 } else { 510 miniCEs[i] = CollationFastLatin.BAIL_OUT; 511 continue; 512 } 513 } 514 if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) { 515 assert(sec == CollationFastLatin.COMMON_SEC); 516 miniCEs[i] = (char)(pri | ter); 517 } else { 518 miniCEs[i] = (char)(pri | sec | ter); 519 } 520 } 521 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 522 printf("last mini primary: %04x\n", pri); 523 #endif */ 524 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2 525 for(int i = 0; i < uniqueCEs.size(); ++i) { 526 long ce = uniqueCEs.elementAti(i); 527 printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]); 528 } 529 #endif */ 530 } 531 encodeCharCEs()532 private void encodeCharCEs() { 533 int miniCEsStart = result.length(); 534 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 535 result.append(0); // initialize to completely ignorable 536 } 537 int indexBase = result.length(); 538 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 539 long ce = charCEs[i][0]; 540 if(isContractionCharCE(ce)) { continue; } // defer contraction 541 int miniCE = encodeTwoCEs(ce, charCEs[i][1]); 542 if((miniCE >>> 16) > 0) { // if ((unsigned)miniCE > 0xffff) 543 // Note: There is a chance that this new expansion is the same as a previous one, 544 // and if so, then we could reuse the other expansion. 545 // However, that seems unlikely. 546 int expansionIndex = result.length() - indexBase; 547 if(expansionIndex > CollationFastLatin.INDEX_MASK) { 548 miniCE = CollationFastLatin.BAIL_OUT; 549 } else { 550 result.append((char)(miniCE >> 16)).append((char)miniCE); 551 miniCE = CollationFastLatin.EXPANSION | expansionIndex; 552 } 553 } 554 result.setCharAt(miniCEsStart + i, (char)miniCE); 555 } 556 } 557 encodeContractions()558 private void encodeContractions() { 559 // We encode all contraction lists so that the first word of a list 560 // terminates the previous list, and we only need one additional terminator at the end. 561 int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS; 562 int firstContractionIndex = result.length(); 563 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 564 long ce = charCEs[i][0]; 565 if(!isContractionCharCE(ce)) { continue; } 566 int contractionIndex = result.length() - indexBase; 567 if(contractionIndex > CollationFastLatin.INDEX_MASK) { 568 result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT); 569 continue; 570 } 571 boolean firstTriple = true; 572 for(int index = (int)ce & 0x7fffffff;; index += 3) { 573 long x = contractionCEs.elementAti(index); 574 if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; } 575 long cce0 = contractionCEs.elementAti(index + 1); 576 long cce1 = contractionCEs.elementAti(index + 2); 577 int miniCE = encodeTwoCEs(cce0, cce1); 578 if(miniCE == CollationFastLatin.BAIL_OUT) { 579 result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 580 } else if((miniCE >>> 16) == 0) { // if ((unsigned)miniCE <= 0xffff) 581 result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 582 result.append((char)miniCE); 583 } else { 584 result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 585 result.append((char)(miniCE >> 16)).append((char)miniCE); 586 } 587 firstTriple = false; 588 } 589 // Note: There is a chance that this new contraction list is the same as a previous one, 590 // and if so, then we could truncate the result and reuse the other list. 591 // However, that seems unlikely. 592 result.setCharAt(headerLength + i, 593 (char)(CollationFastLatin.CONTRACTION | contractionIndex)); 594 } 595 if(result.length() > firstContractionIndex) { 596 // Terminate the last contraction list. 597 result.append((char)CollationFastLatin.CONTR_CHAR_MASK); 598 } 599 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 600 printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2); 601 puts(" header & below-digit groups map"); 602 int i = 0; 603 for(; i < headerLength; ++i) { 604 printf(" %04x", result[i]); 605 } 606 printf("\n char mini CEs"); 607 assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0); 608 for(; i < indexBase; i += 16) { 609 int c = i - headerLength; 610 if(c >= CollationFastLatin.LATIN_LIMIT) { 611 c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT; 612 } 613 printf("\n %04x:", c); 614 for(int j = 0; j < 16; ++j) { 615 printf(" %04x", result[i + j]); 616 } 617 } 618 printf("\n expansions & contractions"); 619 for(; i < result.length(); ++i) { 620 if((i - indexBase) % 16 == 0) { puts(""); } 621 printf(" %04x", result[i]); 622 } 623 puts(""); 624 #endif */ 625 } 626 encodeTwoCEs(long first, long second)627 private int encodeTwoCEs(long first, long second) { 628 if(first == 0) { 629 return 0; // completely ignorable 630 } 631 if(first == Collation.NO_CE) { 632 return CollationFastLatin.BAIL_OUT; 633 } 634 assert((first >>> 32) != Collation.NO_CE_PRIMARY); 635 636 int miniCE = getMiniCE(first); 637 if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; } 638 if(miniCE >= CollationFastLatin.MIN_SHORT) { 639 // Extract & copy the case bits. 640 // Shift them from normal CE bits 15..14 to mini CE bits 4..3. 641 int c = (((int)first & Collation.CASE_MASK) >> (14 - 3)); 642 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1. 643 c += CollationFastLatin.LOWER_CASE; 644 miniCE |= c; 645 } 646 if(second == 0) { return miniCE; } 647 648 int miniCE1 = getMiniCE(second); 649 if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; } 650 651 int case1 = (int)second & Collation.CASE_MASK; 652 if(miniCE >= CollationFastLatin.MIN_SHORT && 653 (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) { 654 // Try to combine the two mini CEs into one. 655 int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK; 656 int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK; 657 if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 && 658 ter1 == CollationFastLatin.COMMON_TER) { 659 // sec1>=sec_high implies pri1==0. 660 return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1; 661 } 662 } 663 664 if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) { 665 // Secondary CE, or a CE with a short primary, copy the case bits. 666 case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE; 667 miniCE1 |= case1; 668 } 669 return (miniCE << 16) | miniCE1; 670 } 671 isContractionCharCE(long ce)672 private static boolean isContractionCharCE(long ce) { 673 return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE; 674 } 675 676 // space, punct, symbol, currency (not digit) 677 private static final int NUM_SPECIAL_GROUPS = 678 Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1; 679 680 private static final long CONTRACTION_FLAG = 0x80000000L; 681 682 // temporary "buffer" 683 private long ce0, ce1; 684 685 private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2]; 686 687 private UVector64 contractionCEs; 688 private UVector64 uniqueCEs; 689 690 /** One 16-bit mini CE per unique CE. */ 691 private char[] miniCEs; 692 693 // These are constant for a given root collator. 694 long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS]; 695 private long firstDigitPrimary; 696 private long firstLatinPrimary; 697 private long lastLatinPrimary; 698 // This determines the first normal primary weight which is mapped to 699 // a short mini primary. It must be >=firstDigitPrimary. 700 private long firstShortPrimary; 701 702 private boolean shortPrimaryOverflow; 703 704 private StringBuilder result = new StringBuilder(); 705 private int headerLength; 706 } 707