1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationBuilder.java, ported from collationbuilder.h/.cpp 10 * 11 * C++ version created on: 2013may06 12 * created by: Markus W. Scherer 13 */ 14 15 package ohos.global.icu.impl.coll; 16 17 import java.text.ParseException; 18 19 import ohos.global.icu.impl.Norm2AllModes; 20 import ohos.global.icu.impl.Normalizer2Impl; 21 import ohos.global.icu.impl.Normalizer2Impl.Hangul; 22 import ohos.global.icu.lang.UScript; 23 import ohos.global.icu.text.CanonicalIterator; 24 import ohos.global.icu.text.Collator; 25 import ohos.global.icu.text.Normalizer2; 26 import ohos.global.icu.text.UnicodeSet; 27 import ohos.global.icu.text.UnicodeSetIterator; 28 import ohos.global.icu.util.ULocale; 29 30 /** 31 * @hide exposed on OHOS 32 */ 33 public final class CollationBuilder extends CollationRuleParser.Sink { 34 private static final boolean DEBUG = false; 35 private static final class BundleImporter implements CollationRuleParser.Importer { BundleImporter()36 BundleImporter() {} 37 @Override getRules(String localeID, String collationType)38 public String getRules(String localeID, String collationType) { 39 return CollationLoader.loadRules(new ULocale(localeID), collationType); 40 } 41 } 42 CollationBuilder(CollationTailoring b)43 public CollationBuilder(CollationTailoring b) { 44 nfd = Normalizer2.getNFDInstance(); 45 fcd = Norm2AllModes.getFCDNormalizer2(); 46 nfcImpl = Norm2AllModes.getNFCInstance().impl; 47 base = b; 48 baseData = b.data; 49 rootElements = new CollationRootElements(b.data.rootElements); 50 variableTop = 0; 51 dataBuilder = new CollationDataBuilder(); 52 fastLatinEnabled = true; 53 cesLength = 0; 54 rootPrimaryIndexes = new UVector32(); 55 nodes = new UVector64(); 56 nfcImpl.ensureCanonIterData(); 57 dataBuilder.initForTailoring(baseData); 58 } 59 parseAndBuild(String ruleString)60 public CollationTailoring parseAndBuild(String ruleString) throws ParseException { 61 if(baseData.rootElements == null) { 62 // C++ U_MISSING_RESOURCE_ERROR 63 throw new UnsupportedOperationException( 64 "missing root elements data, tailoring not supported"); 65 } 66 CollationTailoring tailoring = new CollationTailoring(base.settings); 67 CollationRuleParser parser = new CollationRuleParser(baseData); 68 // Note: This always bases &[last variable] and &[first regular] 69 // on the root collator's maxVariable/variableTop. 70 // If we wanted this to change after [maxVariable x], then we would keep 71 // the tailoring.settings pointer here and read its variableTop when we need it. 72 // See http://unicode.org/cldr/trac/ticket/6070 73 variableTop = base.settings.readOnly().variableTop; 74 parser.setSink(this); 75 // In Java, there is only one Importer implementation. 76 // In C++, the importer is a parameter for this method. 77 parser.setImporter(new BundleImporter()); 78 CollationSettings ownedSettings = tailoring.settings.copyOnWrite(); 79 parser.parse(ruleString, ownedSettings); 80 if(dataBuilder.hasMappings()) { 81 makeTailoredCEs(); 82 closeOverComposites(); 83 finalizeCEs(); 84 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 85 optimizeSet.add(0, 0x7f); 86 optimizeSet.add(0xc0, 0xff); 87 // Hangul is decomposed on the fly during collation, 88 // and the tailoring data is always built with HANGUL_TAG specials. 89 optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 90 dataBuilder.optimize(optimizeSet); 91 tailoring.ensureOwnedData(); 92 if(fastLatinEnabled) { dataBuilder.enableFastLatin(); } 93 dataBuilder.build(tailoring.ownedData); 94 // C++ tailoring.builder = dataBuilder; 95 dataBuilder = null; 96 } else { 97 tailoring.data = baseData; 98 } 99 ownedSettings.fastLatinOptions = CollationFastLatin.getOptions( 100 tailoring.data, ownedSettings, 101 ownedSettings.fastLatinPrimaries); 102 tailoring.setRules(ruleString); 103 // In Java, we do not have a rules version. 104 // In C++, the genrb build tool reads and supplies one, 105 // and the rulesVersion is a parameter for this method. 106 tailoring.setVersion(base.version, 0 /* rulesVersion */); 107 return tailoring; 108 } 109 110 /** Implements CollationRuleParser.Sink. */ 111 @Override addReset(int strength, CharSequence str)112 void addReset(int strength, CharSequence str) { 113 assert(str.length() != 0); 114 if(str.charAt(0) == CollationRuleParser.POS_LEAD) { 115 ces[0] = getSpecialResetPosition(str); 116 cesLength = 1; 117 assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0); 118 } else { 119 // normal reset to a character or string 120 String nfdString = nfd.normalize(str); 121 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 122 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 123 throw new IllegalArgumentException( 124 "reset position maps to too many collation elements (more than 31)"); 125 } 126 } 127 if(strength == Collator.IDENTICAL) { return; } // simple reset-at-position 128 129 // &[before strength]position 130 assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY); 131 int index = findOrInsertNodeForCEs(strength); 132 133 long node = nodes.elementAti(index); 134 // If the index is for a "weaker" node, 135 // then skip backwards over this and further "weaker" nodes. 136 while(strengthFromNode(node) > strength) { 137 index = previousIndexFromNode(node); 138 node = nodes.elementAti(index); 139 } 140 141 // Find or insert a node whose index we will put into a temporary CE. 142 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 143 // Reset to just before this same-strength tailored node. 144 index = previousIndexFromNode(node); 145 } else if(strength == Collator.PRIMARY) { 146 // root primary node (has no previous index) 147 long p = weight32FromNode(node); 148 if(p == 0) { 149 throw new UnsupportedOperationException( 150 "reset primary-before ignorable not possible"); 151 } 152 if(p <= rootElements.getFirstPrimary()) { 153 // There is no primary gap between ignorables and the space-first-primary. 154 throw new UnsupportedOperationException( 155 "reset primary-before first non-ignorable not supported"); 156 } 157 if(p == Collation.FIRST_TRAILING_PRIMARY) { 158 // We do not support tailoring to an unassigned-implicit CE. 159 throw new UnsupportedOperationException( 160 "reset primary-before [first trailing] not supported"); 161 } 162 p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p)); 163 index = findOrInsertNodeForPrimary(p); 164 // Go to the last node in this list: 165 // Tailor after the last node between adjacent root nodes. 166 for(;;) { 167 node = nodes.elementAti(index); 168 int nextIndex = nextIndexFromNode(node); 169 if(nextIndex == 0) { break; } 170 index = nextIndex; 171 } 172 } else { 173 // &[before 2] or &[before 3] 174 index = findCommonNode(index, Collator.SECONDARY); 175 if(strength >= Collator.TERTIARY) { 176 index = findCommonNode(index, Collator.TERTIARY); 177 } 178 // findCommonNode() stayed on the stronger node or moved to 179 // an explicit common-weight node of the reset-before strength. 180 node = nodes.elementAti(index); 181 if(strengthFromNode(node) == strength) { 182 // Found a same-strength node with an explicit weight. 183 int weight16 = weight16FromNode(node); 184 if(weight16 == 0) { 185 throw new UnsupportedOperationException( 186 (strength == Collator.SECONDARY) ? 187 "reset secondary-before secondary ignorable not possible" : 188 "reset tertiary-before completely ignorable not possible"); 189 } 190 assert(weight16 > Collation.BEFORE_WEIGHT16); 191 // Reset to just before this node. 192 // Insert the preceding same-level explicit weight if it is not there already. 193 // Which explicit weight immediately precedes this one? 194 weight16 = getWeight16Before(index, node, strength); 195 // Does this preceding weight have a node? 196 int previousWeight16; 197 int previousIndex = previousIndexFromNode(node); 198 for(int i = previousIndex;; i = previousIndexFromNode(node)) { 199 node = nodes.elementAti(i); 200 int previousStrength = strengthFromNode(node); 201 if(previousStrength < strength) { 202 assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex); 203 // Either the reset element has an above-common weight and 204 // the parent node provides the implied common weight, 205 // or the reset element has a weight<=common in the node 206 // right after the parent, and we need to insert the preceding weight. 207 previousWeight16 = Collation.COMMON_WEIGHT16; 208 break; 209 } else if(previousStrength == strength && !isTailoredNode(node)) { 210 previousWeight16 = weight16FromNode(node); 211 break; 212 } 213 // Skip weaker nodes and same-level tailored nodes. 214 } 215 if(previousWeight16 == weight16) { 216 // The preceding weight has a node, 217 // maybe with following weaker or tailored nodes. 218 // Reset to the last of them. 219 index = previousIndex; 220 } else { 221 // Insert a node with the preceding weight, reset to that. 222 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 223 index = insertNodeBetween(previousIndex, index, node); 224 } 225 } else { 226 // Found a stronger node with implied strength-common weight. 227 int weight16 = getWeight16Before(index, node, strength); 228 index = findOrInsertWeakNode(index, weight16, strength); 229 } 230 // Strength of the temporary CE = strength of its reset position. 231 // Code above raises an error if the before-strength is stronger. 232 strength = ceStrength(ces[cesLength - 1]); 233 } 234 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 235 } 236 237 /** 238 * Returns the secondary or tertiary weight preceding the current node's weight. 239 * node=nodes[index]. 240 */ getWeight16Before(int index, long node, int level)241 private int getWeight16Before(int index, long node, int level) { 242 assert(strengthFromNode(node) < level || !isTailoredNode(node)); 243 // Collect the root CE weights if this node is for a root CE. 244 // If it is not, then return the low non-primary boundary for a tailored CE. 245 int t; 246 if(strengthFromNode(node) == Collator.TERTIARY) { 247 t = weight16FromNode(node); 248 } else { 249 t = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 250 } 251 while(strengthFromNode(node) > Collator.SECONDARY) { 252 index = previousIndexFromNode(node); 253 node = nodes.elementAti(index); 254 } 255 if(isTailoredNode(node)) { 256 return Collation.BEFORE_WEIGHT16; 257 } 258 int s; 259 if(strengthFromNode(node) == Collator.SECONDARY) { 260 s = weight16FromNode(node); 261 } else { 262 s = Collation.COMMON_WEIGHT16; // Stronger node with implied common weight. 263 } 264 while(strengthFromNode(node) > Collator.PRIMARY) { 265 index = previousIndexFromNode(node); 266 node = nodes.elementAti(index); 267 } 268 if(isTailoredNode(node)) { 269 return Collation.BEFORE_WEIGHT16; 270 } 271 // [p, s, t] is a root CE. Return the preceding weight for the requested level. 272 long p = weight32FromNode(node); 273 int weight16; 274 if(level == Collator.SECONDARY) { 275 weight16 = rootElements.getSecondaryBefore(p, s); 276 } else { 277 weight16 = rootElements.getTertiaryBefore(p, s, t); 278 assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0); 279 } 280 return weight16; 281 } 282 getSpecialResetPosition(CharSequence str)283 private long getSpecialResetPosition(CharSequence str) { 284 assert(str.length() == 2); 285 long ce; 286 int strength = Collator.PRIMARY; 287 boolean isBoundary = false; 288 CollationRuleParser.Position pos = 289 CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE]; 290 switch(pos) { 291 case FIRST_TERTIARY_IGNORABLE: 292 // Quaternary CEs are not supported. 293 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 294 return 0; 295 case LAST_TERTIARY_IGNORABLE: 296 return 0; 297 case FIRST_SECONDARY_IGNORABLE: { 298 // Look for a tailored tertiary node after [0, 0, 0]. 299 int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY); 300 long node = nodes.elementAti(index); 301 if((index = nextIndexFromNode(node)) != 0) { 302 node = nodes.elementAti(index); 303 assert(strengthFromNode(node) <= Collator.TERTIARY); 304 if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) { 305 return tempCEFromIndexAndStrength(index, Collator.TERTIARY); 306 } 307 } 308 return rootElements.getFirstTertiaryCE(); 309 // No need to look for nodeHasAnyBefore() on a tertiary node. 310 } 311 case LAST_SECONDARY_IGNORABLE: 312 ce = rootElements.getLastTertiaryCE(); 313 strength = Collator.TERTIARY; 314 break; 315 case FIRST_PRIMARY_IGNORABLE: { 316 // Look for a tailored secondary node after [0, 0, *]. 317 int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY); 318 long node = nodes.elementAti(index); 319 while((index = nextIndexFromNode(node)) != 0) { 320 node = nodes.elementAti(index); 321 strength = strengthFromNode(node); 322 if(strength < Collator.SECONDARY) { break; } 323 if(strength == Collator.SECONDARY) { 324 if(isTailoredNode(node)) { 325 if(nodeHasBefore3(node)) { 326 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 327 assert(isTailoredNode(nodes.elementAti(index))); 328 } 329 return tempCEFromIndexAndStrength(index, Collator.SECONDARY); 330 } else { 331 break; 332 } 333 } 334 } 335 ce = rootElements.getFirstSecondaryCE(); 336 strength = Collator.SECONDARY; 337 break; 338 } 339 case LAST_PRIMARY_IGNORABLE: 340 ce = rootElements.getLastSecondaryCE(); 341 strength = Collator.SECONDARY; 342 break; 343 case FIRST_VARIABLE: 344 ce = rootElements.getFirstPrimaryCE(); 345 isBoundary = true; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 346 break; 347 case LAST_VARIABLE: 348 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 349 break; 350 case FIRST_REGULAR: 351 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 352 isBoundary = true; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 353 break; 354 case LAST_REGULAR: 355 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 356 // for backward compatibility with behavior before the introduction of 357 // script-first-primary CEs in the root collator. 358 ce = rootElements.firstCEWithPrimaryAtLeast( 359 baseData.getFirstPrimaryForGroup(UScript.HAN)); 360 break; 361 case FIRST_IMPLICIT: 362 ce = baseData.getSingleCE(0x4e00); 363 break; 364 case LAST_IMPLICIT: 365 // We do not support tailoring to an unassigned-implicit CE. 366 throw new UnsupportedOperationException( 367 "reset to [last implicit] not supported"); 368 case FIRST_TRAILING: 369 ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY); 370 isBoundary = true; // trailing first primary (there is no mapping for it) 371 break; 372 case LAST_TRAILING: 373 throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF"); 374 default: 375 assert(false); 376 return 0; 377 } 378 379 int index = findOrInsertNodeForRootCE(ce, strength); 380 long node = nodes.elementAti(index); 381 if((pos.ordinal() & 1) == 0) { 382 // even pos = [first xyz] 383 if(!nodeHasAnyBefore(node) && isBoundary) { 384 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 385 // It is reachable via its special contraction, but is not normally used. 386 // Find the first character tailored after the boundary CE, 387 // or the first real root CE after it. 388 if((index = nextIndexFromNode(node)) != 0) { 389 // If there is a following node, then it must be tailored 390 // because there are no root CEs with a boundary primary 391 // and non-common secondary/tertiary weights. 392 node = nodes.elementAti(index); 393 assert(isTailoredNode(node)); 394 ce = tempCEFromIndexAndStrength(index, strength); 395 } else { 396 assert(strength == Collator.PRIMARY); 397 long p = ce >>> 32; 398 int pIndex = rootElements.findPrimary(p); 399 boolean isCompressible = baseData.isCompressiblePrimary(p); 400 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 401 ce = Collation.makeCE(p); 402 index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY); 403 node = nodes.elementAti(index); 404 } 405 } 406 if(nodeHasAnyBefore(node)) { 407 // Get the first node that was tailored before this one at a weaker strength. 408 if(nodeHasBefore2(node)) { 409 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 410 node = nodes.elementAti(index); 411 } 412 if(nodeHasBefore3(node)) { 413 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 414 } 415 assert(isTailoredNode(nodes.elementAti(index))); 416 ce = tempCEFromIndexAndStrength(index, strength); 417 } 418 } else { 419 // odd pos = [last xyz] 420 // Find the last node that was tailored after the [last xyz] 421 // at a strength no greater than the position's strength. 422 for(;;) { 423 int nextIndex = nextIndexFromNode(node); 424 if(nextIndex == 0) { break; } 425 long nextNode = nodes.elementAti(nextIndex); 426 if(strengthFromNode(nextNode) < strength) { break; } 427 index = nextIndex; 428 node = nextNode; 429 } 430 // Do not make a temporary CE for a root node. 431 // This last node might be the node for the root CE itself, 432 // or a node with a common secondary or tertiary weight. 433 if(isTailoredNode(node)) { 434 ce = tempCEFromIndexAndStrength(index, strength); 435 } 436 } 437 return ce; 438 } 439 440 /** Implements CollationRuleParser.Sink. */ 441 @Override addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension)442 void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) { 443 String nfdPrefix; 444 if(prefix.length() == 0) { 445 nfdPrefix = ""; 446 } else { 447 nfdPrefix = nfd.normalize(prefix); 448 } 449 String nfdString = nfd.normalize(str); 450 451 // The runtime code decomposes Hangul syllables on the fly, 452 // with recursive processing but without making the Jamo pieces visible for matching. 453 // It does not work with certain types of contextual mappings. 454 int nfdLength = nfdString.length(); 455 if(nfdLength >= 2) { 456 char c = nfdString.charAt(0); 457 if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) { 458 // While handling a Hangul syllable, contractions starting with Jamo L or V 459 // would not see the following Jamo of that syllable. 460 throw new UnsupportedOperationException( 461 "contractions starting with conjoining Jamo L or V not supported"); 462 } 463 c = nfdString.charAt(nfdLength - 1); 464 if(Hangul.isJamoL(c) || 465 (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) { 466 // A contraction ending with Jamo L or L+V would require 467 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 468 // or decomposing a following Hangul syllable on the fly, during contraction matching. 469 throw new UnsupportedOperationException( 470 "contractions ending with conjoining Jamo L or L+V not supported"); 471 } 472 // A Hangul syllable completely inside a contraction is ok. 473 } 474 // Note: If there is a prefix, then the parser checked that 475 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 476 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 477 // (While handling a Hangul syllable, prefixes on Jamo V or T 478 // would not see the previous Jamo of that syllable.) 479 480 if(strength != Collator.IDENTICAL) { 481 // Find the node index after which we insert the new tailored node. 482 int index = findOrInsertNodeForCEs(strength); 483 assert(cesLength > 0); 484 long ce = ces[cesLength - 1]; 485 if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) { 486 // There is no primary gap between ignorables and the space-first-primary. 487 throw new UnsupportedOperationException( 488 "tailoring primary after ignorables not supported"); 489 } 490 if(strength == Collator.QUATERNARY && ce == 0) { 491 // The CE data structure does not support non-zero quaternary weights 492 // on tertiary ignorables. 493 throw new UnsupportedOperationException( 494 "tailoring quaternary after tertiary ignorables not supported"); 495 } 496 // Insert the new tailored node. 497 index = insertTailoredNodeAfter(index, strength); 498 // Strength of the temporary CE: 499 // The new relation may yield a stronger CE but not a weaker one. 500 int tempStrength = ceStrength(ce); 501 if(strength < tempStrength) { tempStrength = strength; } 502 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 503 } 504 505 setCaseBits(nfdString); 506 507 int cesLengthBeforeExtension = cesLength; 508 if(extension.length() != 0) { 509 String nfdExtension = nfd.normalize(extension); 510 cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength); 511 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 512 throw new IllegalArgumentException( 513 "extension string adds too many collation elements (more than 31 total)"); 514 } 515 } 516 int ce32 = Collation.UNASSIGNED_CE32; 517 if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) && 518 !ignorePrefix(prefix) && !ignoreString(str)) { 519 // Map from the original input to the CEs. 520 // We do this in case the canonical closure is incomplete, 521 // so that it is possible to explicitly provide the missing mappings. 522 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32); 523 } 524 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32); 525 cesLength = cesLengthBeforeExtension; 526 } 527 528 /** 529 * Picks one of the current CEs and finds or inserts a node in the graph 530 * for the CE + strength. 531 */ findOrInsertNodeForCEs(int strength)532 private int findOrInsertNodeForCEs(int strength) { 533 assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY); 534 535 // Find the last CE that is at least as "strong" as the requested difference. 536 // Note: Stronger is smaller (Collator.PRIMARY=0). 537 long ce; 538 for(;; --cesLength) { 539 if(cesLength == 0) { 540 ce = ces[0] = 0; 541 cesLength = 1; 542 break; 543 } else { 544 ce = ces[cesLength - 1]; 545 } 546 if(ceStrength(ce) <= strength) { break; } 547 } 548 549 if(isTempCE(ce)) { 550 // No need to findCommonNode() here for lower levels 551 // because insertTailoredNodeAfter() will do that anyway. 552 return indexFromTempCE(ce); 553 } 554 555 // root CE 556 if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) { 557 throw new UnsupportedOperationException( 558 "tailoring relative to an unassigned code point not supported"); 559 } 560 return findOrInsertNodeForRootCE(ce, strength); 561 } 562 findOrInsertNodeForRootCE(long ce, int strength)563 private int findOrInsertNodeForRootCE(long ce, int strength) { 564 assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE); 565 566 // Find or insert the node for each of the root CE's weights, 567 // down to the requested level/strength. 568 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 569 assert((ce & 0xc0) == 0); 570 int index = findOrInsertNodeForPrimary(ce >>> 32); 571 if(strength >= Collator.SECONDARY) { 572 int lower32 = (int)ce; 573 index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY); 574 if(strength >= Collator.TERTIARY) { 575 index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK, 576 Collator.TERTIARY); 577 } 578 } 579 return index; 580 } 581 582 /** 583 * Like Java Collections.binarySearch(List, key, Comparator). 584 * 585 * @return the index>=0 where the item was found, 586 * or the index<0 for inserting the string at ~index in sorted order 587 * (index into rootPrimaryIndexes) 588 */ binarySearchForRootPrimaryNode( int[] rootPrimaryIndexes, int length, long[] nodes, long p)589 private static final int binarySearchForRootPrimaryNode( 590 int[] rootPrimaryIndexes, int length, long[] nodes, long p) { 591 if(length == 0) { return ~0; } 592 int start = 0; 593 int limit = length; 594 for (;;) { 595 int i = (int)(((long)start + (long)limit) / 2); 596 long node = nodes[rootPrimaryIndexes[i]]; 597 long nodePrimary = node >>> 32; // weight32FromNode(node) 598 if (p == nodePrimary) { 599 return i; 600 } else if (p < nodePrimary) { 601 if (i == start) { 602 return ~start; // insert s before i 603 } 604 limit = i; 605 } else { 606 if (i == start) { 607 return ~(start + 1); // insert s after i 608 } 609 start = i; 610 } 611 } 612 } 613 614 /** Finds or inserts the node for a root CE's primary weight. */ findOrInsertNodeForPrimary(long p)615 private int findOrInsertNodeForPrimary(long p) { 616 int rootIndex = binarySearchForRootPrimaryNode( 617 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 618 if(rootIndex >= 0) { 619 return rootPrimaryIndexes.elementAti(rootIndex); 620 } else { 621 // Start a new list of nodes with this primary. 622 int index = nodes.size(); 623 nodes.addElement(nodeFromWeight32(p)); 624 rootPrimaryIndexes.insertElementAt(index, ~rootIndex); 625 return index; 626 } 627 } 628 629 /** Finds or inserts the node for a secondary or tertiary weight. */ findOrInsertWeakNode(int index, int weight16, int level)630 private int findOrInsertWeakNode(int index, int weight16, int level) { 631 assert(0 <= index && index < nodes.size()); 632 assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY); 633 634 if(weight16 == Collation.COMMON_WEIGHT16) { 635 return findCommonNode(index, level); 636 } 637 638 // If this will be the first below-common weight for the parent node, 639 // then we will also need to insert a common weight after it. 640 long node = nodes.elementAti(index); 641 assert(strengthFromNode(node) < level); // parent node is stronger 642 if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) { 643 int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 644 if((node & hasThisLevelBefore) == 0) { 645 // The parent node has an implied level-common weight. 646 long commonNode = 647 nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level); 648 if(level == Collator.SECONDARY) { 649 // Move the HAS_BEFORE3 flag from the parent node 650 // to the new secondary common node. 651 commonNode |= node & HAS_BEFORE3; 652 node &= ~(long)HAS_BEFORE3; 653 } 654 nodes.setElementAt(node | hasThisLevelBefore, index); 655 // Insert below-common-weight node. 656 int nextIndex = nextIndexFromNode(node); 657 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 658 index = insertNodeBetween(index, nextIndex, node); 659 // Insert common-weight node. 660 insertNodeBetween(index, nextIndex, commonNode); 661 // Return index of below-common-weight node. 662 return index; 663 } 664 } 665 666 // Find the root CE's weight for this level. 667 // Postpone insertion if not found: 668 // Insert the new root node before the next stronger node, 669 // or before the next root node with the same strength and a larger weight. 670 int nextIndex; 671 while((nextIndex = nextIndexFromNode(node)) != 0) { 672 node = nodes.elementAti(nextIndex); 673 int nextStrength = strengthFromNode(node); 674 if(nextStrength <= level) { 675 // Insert before a stronger node. 676 if(nextStrength < level) { break; } 677 // nextStrength == level 678 if(!isTailoredNode(node)) { 679 int nextWeight16 = weight16FromNode(node); 680 if(nextWeight16 == weight16) { 681 // Found the node for the root CE up to this level. 682 return nextIndex; 683 } 684 // Insert before a node with a larger same-strength weight. 685 if(nextWeight16 > weight16) { break; } 686 } 687 } 688 // Skip the next node. 689 index = nextIndex; 690 } 691 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 692 return insertNodeBetween(index, nextIndex, node); 693 } 694 695 /** 696 * Makes and inserts a new tailored node into the list, after the one at index. 697 * Skips over nodes of weaker strength to maintain collation order 698 * ("postpone insertion"). 699 * @return the new node's index 700 */ 701 private int insertTailoredNodeAfter(int index, int strength) { 702 assert(0 <= index && index < nodes.size()); 703 if(strength >= Collator.SECONDARY) { 704 index = findCommonNode(index, Collator.SECONDARY); 705 if(strength >= Collator.TERTIARY) { 706 index = findCommonNode(index, Collator.TERTIARY); 707 } 708 } 709 // Postpone insertion: 710 // Insert the new node before the next one with a strength at least as strong. 711 long node = nodes.elementAti(index); 712 int nextIndex; 713 while((nextIndex = nextIndexFromNode(node)) != 0) { 714 node = nodes.elementAti(nextIndex); 715 if(strengthFromNode(node) <= strength) { break; } 716 // Skip the next node which has a weaker (larger) strength than the new one. 717 index = nextIndex; 718 } 719 node = IS_TAILORED | nodeFromStrength(strength); 720 return insertNodeBetween(index, nextIndex, node); 721 } 722 723 /** 724 * Inserts a new node into the list, between list-adjacent items. 725 * The node's previous and next indexes must not be set yet. 726 * @return the new node's index 727 */ 728 private int insertNodeBetween(int index, int nextIndex, long node) { 729 assert(previousIndexFromNode(node) == 0); 730 assert(nextIndexFromNode(node) == 0); 731 assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 732 // Append the new node and link it to the existing nodes. 733 int newIndex = nodes.size(); 734 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 735 nodes.addElement(node); 736 // nodes[index].nextIndex = newIndex 737 node = nodes.elementAti(index); 738 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 739 // nodes[nextIndex].previousIndex = newIndex 740 if(nextIndex != 0) { 741 node = nodes.elementAti(nextIndex); 742 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 743 } 744 return newIndex; 745 } 746 747 /** 748 * Finds the node which implies or contains a common=05 weight of the given strength 749 * (secondary or tertiary), if the current node is stronger. 750 * Skips weaker nodes and tailored nodes if the current node is stronger 751 * and is followed by an explicit-common-weight node. 752 * Always returns the input index if that node is no stronger than the given strength. 753 */ 754 private int findCommonNode(int index, int strength) { 755 assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY); 756 long node = nodes.elementAti(index); 757 if(strengthFromNode(node) >= strength) { 758 // The current node is no stronger. 759 return index; 760 } 761 if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 762 // The current node implies the strength-common weight. 763 return index; 764 } 765 index = nextIndexFromNode(node); 766 node = nodes.elementAti(index); 767 assert(!isTailoredNode(node) && strengthFromNode(node) == strength && 768 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 769 // Skip to the explicit common node. 770 do { 771 index = nextIndexFromNode(node); 772 node = nodes.elementAti(index); 773 assert(strengthFromNode(node) >= strength); 774 } while(isTailoredNode(node) || strengthFromNode(node) > strength || 775 weight16FromNode(node) < Collation.COMMON_WEIGHT16); 776 assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16); 777 return index; 778 } 779 780 private void setCaseBits(CharSequence nfdString) { 781 int numTailoredPrimaries = 0; 782 for(int i = 0; i < cesLength; ++i) { 783 if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; } 784 } 785 // We should not be able to get too many case bits because 786 // cesLength<=31==MAX_EXPANSION_LENGTH. 787 // 31 pairs of case bits fit into an long without setting its sign bit. 788 assert(numTailoredPrimaries <= 31); 789 790 long cases = 0; 791 if(numTailoredPrimaries > 0) { 792 CharSequence s = nfdString; 793 UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0); 794 int baseCEsLength = baseCEs.fetchCEs() - 1; 795 assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE); 796 797 int lastCase = 0; 798 int numBasePrimaries = 0; 799 for(int i = 0; i < baseCEsLength; ++i) { 800 long ce = baseCEs.getCE(i); 801 if((ce >>> 32) != 0) { 802 ++numBasePrimaries; 803 int c = ((int)ce >> 14) & 3; 804 assert(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 805 if(numBasePrimaries < numTailoredPrimaries) { 806 cases |= (long)c << ((numBasePrimaries - 1) * 2); 807 } else if(numBasePrimaries == numTailoredPrimaries) { 808 lastCase = c; 809 } else if(c != lastCase) { 810 // There are more base primary CEs than tailored primaries. 811 // Set mixed case if the case bits of the remainder differ. 812 lastCase = 1; 813 // Nothing more can change. 814 break; 815 } 816 } 817 } 818 if(numBasePrimaries >= numTailoredPrimaries) { 819 cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2); 820 } 821 } 822 823 for(int i = 0; i < cesLength; ++i) { 824 long ce = ces[i] & 0xffffffffffff3fffL; // clear old case bits 825 int strength = ceStrength(ce); 826 if(strength == Collator.PRIMARY) { 827 ce |= (cases & 3) << 14; 828 cases >>>= 2; 829 } else if(strength == Collator.TERTIARY) { 830 // Tertiary CEs must have uppercase bits. 831 // See the LDML spec, and comments in class CollationCompare. 832 ce |= 0x8000; 833 } 834 // Tertiary ignorable CEs must have 0 case bits. 835 // We set 0 case bits for secondary CEs too 836 // since currently only U+0345 is cased and maps to a secondary CE, 837 // and it is lowercase. Other secondaries are uncased. 838 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 839 ces[i] = ce; 840 } 841 } 842 843 /** Implements CollationRuleParser.Sink. */ 844 @Override 845 void suppressContractions(UnicodeSet set) { 846 dataBuilder.suppressContractions(set); 847 } 848 849 /** Implements CollationRuleParser.Sink. */ 850 @Override 851 void optimize(UnicodeSet set) { 852 optimizeSet.addAll(set); 853 } 854 855 /** 856 * Adds the mapping and its canonical closure. 857 * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder 858 * need not re-encode the CEs multiple times. 859 */ 860 private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, 861 long[] newCEs, int newCEsLength, int ce32) { 862 // Map from the NFD input to the CEs. 863 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 864 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32); 865 addTailComposites(nfdPrefix, nfdString); 866 return ce32; 867 } 868 869 private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, 870 long[] newCEs, int newCEsLength, int ce32) { 871 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 872 // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String 873 if(nfdPrefix.length() == 0) { 874 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 875 String prefix = ""; 876 for(;;) { 877 String str = stringIter.next(); 878 if(str == null) { break; } 879 if(ignoreString(str) || str.contentEquals(nfdString)) { continue; } 880 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 881 } 882 } else { 883 CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString()); 884 CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString()); 885 for(;;) { 886 String prefix = prefixIter.next(); 887 if(prefix == null) { break; } 888 if(ignorePrefix(prefix)) { continue; } 889 boolean samePrefix = prefix.contentEquals(nfdPrefix); 890 for(;;) { 891 String str = stringIter.next(); 892 if(str == null) { break; } 893 if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; } 894 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32); 895 } 896 stringIter.reset(); 897 } 898 } 899 return ce32; 900 } 901 902 private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) { 903 // Look for the last starter in the NFD string. 904 int lastStarter; 905 int indexAfterLastStarter = nfdString.length(); 906 for(;;) { 907 if(indexAfterLastStarter == 0) { return; } // no starter at all 908 lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter); 909 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 910 indexAfterLastStarter -= Character.charCount(lastStarter); 911 } 912 // No closure to Hangul syllables since we decompose them on the fly. 913 if(Hangul.isJamoL(lastStarter)) { return; } 914 915 // Are there any composites whose decomposition starts with the lastStarter? 916 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 917 // We might find some more equivalent mappings here if it did. 918 UnicodeSet composites = new UnicodeSet(); 919 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 920 921 StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder(); 922 long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 923 UnicodeSetIterator iter = new UnicodeSetIterator(composites); 924 while(iter.next()) { 925 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 926 int composite = iter.codepoint; 927 String decomp = nfd.getDecomposition(composite); 928 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 929 newNFDString, newString)) { 930 continue; 931 } 932 int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0); 933 if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) { 934 // Ignore mappings that we cannot store. 935 continue; 936 } 937 // Note: It is possible that the newCEs do not make use of the mapping 938 // for which we are adding the tail composites, in which case we might be adding 939 // unnecessary mappings. 940 // For example, when we add tail composites for ae^ (^=combining circumflex), 941 // UCA discontiguous-contraction matching does not find any matches 942 // for ae_^ (_=any combining diacritic below) *unless* there is also 943 // a contraction mapping for ae. 944 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 945 // while fetching the newCEs for ae_^. 946 // TODO: Try to detect this effectively. 947 // (Alternatively, print a warning when prefix contractions are missing.) 948 949 // We do not need an explicit mapping for the NFD strings. 950 // It is fine if the NFD input collates like this via a sequence of mappings. 951 // It also saves a little bit of space, and may reduce the set of characters with contractions. 952 int ce32 = addIfDifferent(nfdPrefix, newString, 953 newCEs, newCEsLength, Collation.UNASSIGNED_CE32); 954 if(ce32 != Collation.UNASSIGNED_CE32) { 955 // was different, was added 956 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32); 957 } 958 } 959 } 960 961 private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, 962 int composite, CharSequence decomp, 963 StringBuilder newNFDString, StringBuilder newString) { 964 assert(Character.codePointBefore(nfdString, indexAfterLastStarter) == 965 Character.codePointAt(decomp, 0)); 966 int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1); 967 if(lastStarterLength == decomp.length()) { 968 // Singleton decompositions should be found by addWithClosure() 969 // and the CanonicalIterator, so we can ignore them here. 970 return false; 971 } 972 if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) { 973 // same strings, nothing new to be found here 974 return false; 975 } 976 977 // Make new FCD strings that combine a composite, or its decomposition, 978 // into the nfdString's last starter and the combining marks following it. 979 // Make an NFD version, and a version with the composite. 980 newNFDString.setLength(0); 981 newNFDString.append(nfdString, 0, indexAfterLastStarter); 982 newString.setLength(0); 983 newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength) 984 .appendCodePoint(composite); 985 986 // The following is related to discontiguous contraction matching, 987 // but builds only FCD strings (or else returns false). 988 int sourceIndex = indexAfterLastStarter; 989 int decompIndex = lastStarterLength; 990 // Small optimization: We keep the source character across loop iterations 991 // because we do not always consume it, 992 // and then need not fetch it again nor look up its combining class again. 993 int sourceChar = Collation.SENTINEL_CP; 994 // The cc variables need to be declared before the loop so that at the end 995 // they are set to the last combining classes seen. 996 int sourceCC = 0; 997 int decompCC = 0; 998 for(;;) { 999 if(sourceChar < 0) { 1000 if(sourceIndex >= nfdString.length()) { break; } 1001 sourceChar = Character.codePointAt(nfdString, sourceIndex); 1002 sourceCC = nfd.getCombiningClass(sourceChar); 1003 assert(sourceCC != 0); 1004 } 1005 // We consume a decomposition character in each iteration. 1006 if(decompIndex >= decomp.length()) { break; } 1007 int decompChar = Character.codePointAt(decomp, decompIndex); 1008 decompCC = nfd.getCombiningClass(decompChar); 1009 // Compare the two characters and their combining classes. 1010 if(decompCC == 0) { 1011 // Unable to merge because the source contains a non-zero combining mark 1012 // but the composite's decomposition contains another starter. 1013 // The strings would not be equivalent. 1014 return false; 1015 } else if(sourceCC < decompCC) { 1016 // Composite + sourceChar would not be FCD. 1017 return false; 1018 } else if(decompCC < sourceCC) { 1019 newNFDString.appendCodePoint(decompChar); 1020 decompIndex += Character.charCount(decompChar); 1021 } else if(decompChar != sourceChar) { 1022 // Blocked because same combining class. 1023 return false; 1024 } else { // match: decompChar == sourceChar 1025 newNFDString.appendCodePoint(decompChar); 1026 decompIndex += Character.charCount(decompChar); 1027 sourceIndex += Character.charCount(decompChar); 1028 sourceChar = Collation.SENTINEL_CP; 1029 } 1030 } 1031 // We are at the end of at least one of the two inputs. 1032 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1033 if(sourceCC < decompCC) { 1034 // Appending the next source character to the composite would not be FCD. 1035 return false; 1036 } 1037 newNFDString.append(nfdString, sourceIndex, nfdString.length()); 1038 newString.append(nfdString, sourceIndex, nfdString.length()); 1039 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1040 newNFDString.append(decomp, decompIndex, decomp.length()); 1041 } 1042 assert(nfd.isNormalized(newNFDString)); 1043 assert(fcd.isNormalized(newString)); 1044 assert(nfd.normalize(newString).equals(newNFDString.toString())); // canonically equivalent 1045 return true; 1046 } 1047 1048 private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) { 1049 // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0 1050 int leftLength = left.length(); 1051 if((leftLength - leftStart) != (right.length() - rightStart)) { return false; } 1052 while(leftStart < leftLength) { 1053 if(left.charAt(leftStart++) != right.charAt(rightStart++)) { 1054 return false; 1055 } 1056 } 1057 return true; 1058 } 1059 private boolean ignorePrefix(CharSequence s) { 1060 // Do not map non-FCD prefixes. 1061 return !isFCD(s); 1062 } 1063 private boolean ignoreString(CharSequence s) { 1064 // Do not map non-FCD strings. 1065 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1066 return !isFCD(s) || Hangul.isHangul(s.charAt(0)); 1067 } 1068 private boolean isFCD(CharSequence s) { 1069 return fcd.isNormalized(s); 1070 } 1071 1072 private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]"); 1073 static { 1074 // Hangul is decomposed on the fly during collation. 1075 COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END); 1076 } 1077 1078 private void closeOverComposites() { 1079 String prefix = ""; // empty 1080 UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES); 1081 while(iter.next()) { 1082 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 1083 String nfdString = nfd.getDecomposition(iter.codepoint); 1084 cesLength = dataBuilder.getCEs(nfdString, ces, 0); 1085 if(cesLength > Collation.MAX_EXPANSION_LENGTH) { 1086 // Too many CEs from the decomposition (unusual), ignore this composite. 1087 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1088 // However, this can only really happen in contrived cases. 1089 continue; 1090 } 1091 String composite = iter.getString(); 1092 addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32); 1093 } 1094 } 1095 1096 private int addIfDifferent(CharSequence prefix, CharSequence str, 1097 long[] newCEs, int newCEsLength, int ce32) { 1098 long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 1099 int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0); 1100 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1101 if(ce32 == Collation.UNASSIGNED_CE32) { 1102 ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength); 1103 } 1104 dataBuilder.addCE32(prefix, str, ce32); 1105 } 1106 return ce32; 1107 } 1108 1109 private static boolean sameCEs(long[] ces1, int ces1Length, 1110 long[] ces2, int ces2Length) { 1111 if(ces1Length != ces2Length) { 1112 return false; 1113 } 1114 assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH); 1115 for(int i = 0; i < ces1Length; ++i) { 1116 if(ces1[i] != ces2[i]) { return false; } 1117 } 1118 return true; 1119 } 1120 1121 private static final int alignWeightRight(int w) { 1122 if(w != 0) { 1123 while((w & 0xff) == 0) { w >>>= 8; } 1124 } 1125 return w; 1126 } 1127 1128 /** 1129 * Walks the tailoring graph and overwrites tailored nodes with new CEs. 1130 * After this, the graph is destroyed. 1131 * The nodes array can then be used only as a source of tailored CEs. 1132 */ 1133 private void makeTailoredCEs() { 1134 CollationWeights primaries = new CollationWeights(); 1135 CollationWeights secondaries = new CollationWeights(); 1136 CollationWeights tertiaries = new CollationWeights(); 1137 long[] nodesArray = nodes.getBuffer(); 1138 if(DEBUG) { 1139 System.out.println("\nCollationBuilder.makeTailoredCEs()"); 1140 } 1141 1142 for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1143 int i = rootPrimaryIndexes.elementAti(rpi); 1144 long node = nodesArray[i]; 1145 long p = weight32FromNode(node); 1146 int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16; 1147 int t = s; 1148 int q = 0; 1149 boolean pIsTailored = false; 1150 boolean sIsTailored = false; 1151 boolean tIsTailored = false; 1152 if(DEBUG) { 1153 System.out.printf("\nprimary %x\n", alignWeightRight((int)p)); 1154 } 1155 int pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1156 int nextIndex = nextIndexFromNode(node); 1157 while(nextIndex != 0) { 1158 i = nextIndex; 1159 node = nodesArray[i]; 1160 nextIndex = nextIndexFromNode(node); 1161 int strength = strengthFromNode(node); 1162 if(strength == Collator.QUATERNARY) { 1163 assert(isTailoredNode(node)); 1164 if(DEBUG) { 1165 System.out.print(" quat+ "); 1166 } 1167 if(q == 3) { 1168 // C++ U_BUFFER_OVERFLOW_ERROR 1169 throw new UnsupportedOperationException("quaternary tailoring gap too small"); 1170 } 1171 ++q; 1172 } else { 1173 if(strength == Collator.TERTIARY) { 1174 if(isTailoredNode(node)) { 1175 if(DEBUG) { 1176 System.out.print(" ter+ "); 1177 } 1178 if(!tIsTailored) { 1179 // First tailored tertiary node for [p, s]. 1180 int tCount = countTailoredNodes(nodesArray, nextIndex, 1181 Collator.TERTIARY) + 1; 1182 int tLimit; 1183 if(t == 0) { 1184 // Gap at the beginning of the tertiary CE range. 1185 t = rootElements.getTertiaryBoundary() - 0x100; 1186 tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK; 1187 } else if(!pIsTailored && !sIsTailored) { 1188 // p and s are root weights. 1189 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1190 } else if(t == Collation.BEFORE_WEIGHT16) { 1191 tLimit = Collation.COMMON_WEIGHT16; 1192 } else { 1193 // [p, s] is tailored. 1194 assert(t == Collation.COMMON_WEIGHT16); 1195 tLimit = rootElements.getTertiaryBoundary(); 1196 } 1197 assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0); 1198 tertiaries.initForTertiary(); 1199 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1200 // C++ U_BUFFER_OVERFLOW_ERROR 1201 throw new UnsupportedOperationException("tertiary tailoring gap too small"); 1202 } 1203 tIsTailored = true; 1204 } 1205 t = (int)tertiaries.nextWeight(); 1206 assert(t != 0xffffffff); 1207 } else { 1208 t = weight16FromNode(node); 1209 tIsTailored = false; 1210 if(DEBUG) { 1211 System.out.printf(" ter %x\n", alignWeightRight(t)); 1212 } 1213 } 1214 } else { 1215 if(strength == Collator.SECONDARY) { 1216 if(isTailoredNode(node)) { 1217 if(DEBUG) { 1218 System.out.print(" sec+ "); 1219 } 1220 if(!sIsTailored) { 1221 // First tailored secondary node for p. 1222 int sCount = countTailoredNodes(nodesArray, nextIndex, 1223 Collator.SECONDARY) + 1; 1224 int sLimit; 1225 if(s == 0) { 1226 // Gap at the beginning of the secondary CE range. 1227 s = rootElements.getSecondaryBoundary() - 0x100; 1228 sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16); 1229 } else if(!pIsTailored) { 1230 // p is a root primary. 1231 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1232 } else if(s == Collation.BEFORE_WEIGHT16) { 1233 sLimit = Collation.COMMON_WEIGHT16; 1234 } else { 1235 // p is a tailored primary. 1236 assert(s == Collation.COMMON_WEIGHT16); 1237 sLimit = rootElements.getSecondaryBoundary(); 1238 } 1239 if(s == Collation.COMMON_WEIGHT16) { 1240 // Do not tailor into the getSortKey() range of 1241 // compressed common secondaries. 1242 s = rootElements.getLastCommonSecondary(); 1243 } 1244 secondaries.initForSecondary(); 1245 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1246 // C++ U_BUFFER_OVERFLOW_ERROR 1247 if(DEBUG) { 1248 System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n", 1249 alignWeightRight(s), alignWeightRight(sLimit), 1250 alignWeightRight(sCount)); 1251 } 1252 throw new UnsupportedOperationException("secondary tailoring gap too small"); 1253 } 1254 sIsTailored = true; 1255 } 1256 s = (int)secondaries.nextWeight(); 1257 assert(s != 0xffffffff); 1258 } else { 1259 s = weight16FromNode(node); 1260 sIsTailored = false; 1261 if(DEBUG) { 1262 System.out.printf(" sec %x\n", alignWeightRight(s)); 1263 } 1264 } 1265 } else /* Collator.PRIMARY */ { 1266 assert(isTailoredNode(node)); 1267 if(DEBUG) { 1268 System.out.print("pri+ "); 1269 } 1270 if(!pIsTailored) { 1271 // First tailored primary node in this list. 1272 int pCount = countTailoredNodes(nodesArray, nextIndex, 1273 Collator.PRIMARY) + 1; 1274 boolean isCompressible = baseData.isCompressiblePrimary(p); 1275 long pLimit = 1276 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1277 primaries.initForPrimary(isCompressible); 1278 if(!primaries.allocWeights(p, pLimit, pCount)) { 1279 // C++ U_BUFFER_OVERFLOW_ERROR // TODO: introduce a more specific UErrorCode? 1280 throw new UnsupportedOperationException("primary tailoring gap too small"); 1281 } 1282 pIsTailored = true; 1283 } 1284 p = primaries.nextWeight(); 1285 assert(p != 0xffffffffL); 1286 s = Collation.COMMON_WEIGHT16; 1287 sIsTailored = false; 1288 } 1289 t = s == 0 ? 0 : Collation.COMMON_WEIGHT16; 1290 tIsTailored = false; 1291 } 1292 q = 0; 1293 } 1294 if(isTailoredNode(node)) { 1295 nodesArray[i] = Collation.makeCE(p, s, t, q); 1296 if(DEBUG) { 1297 System.out.printf("%016x\n", nodesArray[i]); 1298 } 1299 } 1300 } 1301 } 1302 } 1303 1304 /** 1305 * Counts the tailored nodes of the given strength up to the next node 1306 * which is either stronger or has an explicit weight of this strength. 1307 */ 1308 private static int countTailoredNodes(long[] nodesArray, int i, int strength) { 1309 int count = 0; 1310 for(;;) { 1311 if(i == 0) { break; } 1312 long node = nodesArray[i]; 1313 if(strengthFromNode(node) < strength) { break; } 1314 if(strengthFromNode(node) == strength) { 1315 if(isTailoredNode(node)) { 1316 ++count; 1317 } else { 1318 break; 1319 } 1320 } 1321 i = nextIndexFromNode(node); 1322 } 1323 return count; 1324 } 1325 1326 private static final class CEFinalizer implements CollationDataBuilder.CEModifier { 1327 CEFinalizer(long[] ces) { 1328 finalCEs = ces; 1329 } 1330 @Override 1331 public long modifyCE32(int ce32) { 1332 assert(!Collation.isSpecialCE32(ce32)); 1333 if(CollationBuilder.isTempCE32(ce32)) { 1334 // retain case bits 1335 return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1336 } else { 1337 return Collation.NO_CE; 1338 } 1339 } 1340 @Override 1341 public long modifyCE(long ce) { 1342 if(CollationBuilder.isTempCE(ce)) { 1343 // retain case bits 1344 return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000); 1345 } else { 1346 return Collation.NO_CE; 1347 } 1348 } 1349 1350 private long[] finalCEs; 1351 }; 1352 1353 /** Replaces temporary CEs with the final CEs they point to. */ 1354 private void finalizeCEs() { 1355 CollationDataBuilder newBuilder = new CollationDataBuilder(); 1356 newBuilder.initForTailoring(baseData); 1357 CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer()); 1358 newBuilder.copyFrom(dataBuilder, finalizer); 1359 dataBuilder = newBuilder; 1360 } 1361 1362 /** 1363 * Encodes "temporary CE" data into a CE that fits into the CE32 data structure, 1364 * with 2-byte primary, 1-byte secondary and 6-bit tertiary, 1365 * with valid CE byte values. 1366 * 1367 * The index must not exceed 20 bits (0xfffff). 1368 * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). 1369 * 1370 * Temporary CEs are distinguished from real CEs by their use of 1371 * secondary weights 06..45 which are otherwise reserved for compressed sort keys. 1372 * 1373 * The case bits are unused and available. 1374 */ 1375 private static long tempCEFromIndexAndStrength(int index, int strength) { 1376 return 1377 // CE byte offsets, to ensure valid CE bytes, and case bits 11 1378 0x4040000006002000L + 1379 // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF) 1380 ((long)(index & 0xfe000) << 43) + 1381 // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF) 1382 ((long)(index & 0x1fc0) << 42) + 1383 // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45) 1384 ((index & 0x3f) << 24) + 1385 // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23) 1386 (strength << 8); 1387 } 1388 private static int indexFromTempCE(long tempCE) { 1389 tempCE -= 0x4040000006002000L; 1390 return 1391 ((int)(tempCE >> 43) & 0xfe000) | 1392 ((int)(tempCE >> 42) & 0x1fc0) | 1393 ((int)(tempCE >> 24) & 0x3f); 1394 } 1395 private static int strengthFromTempCE(long tempCE) { 1396 return ((int)tempCE >> 8) & 3; 1397 } 1398 private static boolean isTempCE(long ce) { 1399 int sec = (int)ce >>> 24; 1400 return 6 <= sec && sec <= 0x45; 1401 } 1402 1403 private static int indexFromTempCE32(int tempCE32) { 1404 tempCE32 -= 0x40400620; 1405 return 1406 ((tempCE32 >> 11) & 0xfe000) | 1407 ((tempCE32 >> 10) & 0x1fc0) | 1408 ((tempCE32 >> 8) & 0x3f); 1409 } isTempCE32(int ce32)1410 private static boolean isTempCE32(int ce32) { 1411 return 1412 (ce32 & 0xff) >= 2 && // not a long-primary/long-secondary CE32 1413 6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45; 1414 } 1415 ceStrength(long ce)1416 private static int ceStrength(long ce) { 1417 return 1418 isTempCE(ce) ? strengthFromTempCE(ce) : 1419 (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY : 1420 ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY : 1421 ce != 0 ? Collator.TERTIARY : 1422 Collator.IDENTICAL; 1423 } 1424 1425 /** At most 1M nodes, limited by the 20 bits in node bit fields. */ 1426 private static final int MAX_INDEX = 0xfffff; 1427 /** 1428 * Node bit 6 is set on a primary node if there are nodes 1429 * with secondary values below the common secondary weight (05). 1430 */ 1431 private static final int HAS_BEFORE2 = 0x40; 1432 /** 1433 * Node bit 5 is set on a primary or secondary node if there are nodes 1434 * with tertiary values below the common tertiary weight (05). 1435 */ 1436 private static final int HAS_BEFORE3 = 0x20; 1437 /** 1438 * Node bit 3 distinguishes a tailored node, which has no weight value, 1439 * from a node with an explicit (root or default) weight. 1440 */ 1441 private static final int IS_TAILORED = 8; 1442 nodeFromWeight32(long weight32)1443 private static long nodeFromWeight32(long weight32) { 1444 return weight32 << 32; 1445 } nodeFromWeight16(int weight16)1446 private static long nodeFromWeight16(int weight16) { 1447 return (long)weight16 << 48; 1448 } nodeFromPreviousIndex(int previous)1449 private static long nodeFromPreviousIndex(int previous) { 1450 return (long)previous << 28; 1451 } nodeFromNextIndex(int next)1452 private static long nodeFromNextIndex(int next) { 1453 return next << 8; 1454 } nodeFromStrength(int strength)1455 private static long nodeFromStrength(int strength) { 1456 return strength; 1457 } 1458 weight32FromNode(long node)1459 private static long weight32FromNode(long node) { 1460 return node >>> 32; 1461 } weight16FromNode(long node)1462 private static int weight16FromNode(long node) { 1463 return (int)(node >> 48) & 0xffff; 1464 } previousIndexFromNode(long node)1465 private static int previousIndexFromNode(long node) { 1466 return (int)(node >> 28) & MAX_INDEX; 1467 } nextIndexFromNode(long node)1468 private static int nextIndexFromNode(long node) { 1469 return ((int)node >> 8) & MAX_INDEX; 1470 } strengthFromNode(long node)1471 private static int strengthFromNode(long node) { 1472 return (int)node & 3; 1473 } 1474 nodeHasBefore2(long node)1475 private static boolean nodeHasBefore2(long node) { 1476 return (node & HAS_BEFORE2) != 0; 1477 } nodeHasBefore3(long node)1478 private static boolean nodeHasBefore3(long node) { 1479 return (node & HAS_BEFORE3) != 0; 1480 } nodeHasAnyBefore(long node)1481 private static boolean nodeHasAnyBefore(long node) { 1482 return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0; 1483 } isTailoredNode(long node)1484 private static boolean isTailoredNode(long node) { 1485 return (node & IS_TAILORED) != 0; 1486 } 1487 changeNodePreviousIndex(long node, int previous)1488 private static long changeNodePreviousIndex(long node, int previous) { 1489 return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous); 1490 } changeNodeNextIndex(long node, int next)1491 private static long changeNodeNextIndex(long node, int next) { 1492 return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next); 1493 } 1494 1495 private Normalizer2 nfd, fcd; 1496 private Normalizer2Impl nfcImpl; 1497 1498 private CollationTailoring base; 1499 private CollationData baseData; 1500 private CollationRootElements rootElements; 1501 private long variableTop; 1502 1503 private CollationDataBuilder dataBuilder; 1504 private boolean fastLatinEnabled; 1505 private UnicodeSet optimizeSet = new UnicodeSet(); 1506 1507 private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; 1508 private int cesLength; 1509 1510 /** 1511 * Indexes of nodes with root primary weights, sorted by primary. 1512 * Compact form of a TreeMap from root primary to node index. 1513 * 1514 * This is a performance optimization for finding reset positions. 1515 * Without this, we would have to search through the entire nodes list. 1516 * It also allows storing root primary weights in list head nodes, 1517 * without previous index, leaving room in root primary nodes for 32-bit primary weights. 1518 */ 1519 private UVector32 rootPrimaryIndexes; 1520 /** 1521 * Data structure for assigning tailored weights and CEs. 1522 * Doubly-linked lists of nodes in mostly collation order. 1523 * Each list starts with a root primary node and ends with a nextIndex of 0. 1524 * 1525 * When there are any nodes in the list, then there is always a root primary node at index 0. 1526 * This allows some code not to have to check explicitly for nextIndex==0. 1527 * 1528 * Root primary nodes have 32-bit weights but do not have previous indexes. 1529 * All other nodes have at most 16-bit weights and do have previous indexes. 1530 * 1531 * Nodes with explicit weights store root collator weights, 1532 * or default weak weights (e.g., secondary 05) for stronger nodes. 1533 * "Tailored" nodes, with the IS_TAILORED bit set, 1534 * do not store explicit weights but rather 1535 * create a difference of a certain strength from the preceding node. 1536 * 1537 * A root node is followed by either 1538 * - a root/default node of the same strength, or 1539 * - a root/default node of the next-weaker strength, or 1540 * - a tailored node of the same strength. 1541 * 1542 * A node of a given strength normally implies "common" weights on weaker levels. 1543 * 1544 * A node with HAS_BEFORE2 must be immediately followed by 1545 * a secondary node with an explicit below-common weight, then a secondary tailored node, 1546 * and later an explicit common-secondary node. 1547 * The below-common weight can be a root weight, 1548 * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight 1549 * or before the lowest root weight. 1550 * (&[before 2] resets to an explicit secondary node so that 1551 * the following addRelation(secondary) tailors right after that. 1552 * If we did not have this node and instead were to reset on the primary node, 1553 * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) 1554 * 1555 * If the flag is not set, then there are no explicit secondary nodes 1556 * with the common or lower weights. 1557 * 1558 * Same for HAS_BEFORE3 for tertiary nodes and weights. 1559 * A node must not have both flags set. 1560 * 1561 * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs 1562 * which point to stable indexes in this list, 1563 * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. 1564 * 1565 * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, 1566 * until the next relation is added. 1567 * 1568 * At the end, the tailored weights are allocated as necessary, 1569 * then the tailored nodes are replaced with final CEs, 1570 * and the CollationData is rewritten by replacing temporary CEs with final ones. 1571 * 1572 * We cannot simply insert new nodes in the middle of the array 1573 * because that would invalidate the indexes stored in existing temporary CEs. 1574 * We need to use a linked graph with stable indexes to existing nodes. 1575 * A doubly-linked list seems easiest to maintain. 1576 * 1577 * Each node is stored as an long, with its fields stored as bit fields. 1578 * 1579 * Root primary node: 1580 * - primary weight: 32 bits 63..32 1581 * - reserved/unused/zero: 4 bits 31..28 1582 * 1583 * Weaker root nodes & tailored nodes: 1584 * - a weight: 16 bits 63..48 1585 * + a root or default weight for a non-tailored node 1586 * + unused/zero for a tailored node 1587 * - index to the previous node: 20 bits 47..28 1588 * 1589 * All types of nodes: 1590 * - index to the next node: 20 bits 27..8 1591 * + nextIndex=0 in last node per root-primary list 1592 * - reserved/unused/zero bits: bits 7, 4, 2 1593 * - HAS_BEFORE2: bit 6 1594 * - HAS_BEFORE3: bit 5 1595 * - IS_TAILORED: bit 3 1596 * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 1597 * 1598 * We could allocate structs with pointers, but we would have to store them 1599 * in a pointer list so that they can be indexed from temporary CEs, 1600 * and they would require more memory allocations. 1601 */ 1602 private UVector64 nodes; 1603 } 1604