1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * CollationIterator.java, ported from collationiterator.h/.cpp 10 * 11 * C++ version created on: 2010oct27 12 * created by: Markus W. Scherer 13 */ 14 15 package ohos.global.icu.impl.coll; 16 17 import ohos.global.icu.impl.Normalizer2Impl.Hangul; 18 import ohos.global.icu.impl.Trie2_32; 19 import ohos.global.icu.util.BytesTrie; 20 import ohos.global.icu.util.CharsTrie; 21 import ohos.global.icu.util.ICUException; 22 23 /** 24 * Collation element iterator and abstract character iterator. 25 * 26 * When a method returns a code point value, it must be in 0..10FFFF, 27 * except it can be negative as a sentinel value. 28 * @hide exposed on OHOS 29 */ 30 public abstract class CollationIterator { 31 private static final class CEBuffer { 32 /** Large enough for CEs of most short strings. */ 33 private static final int INITIAL_CAPACITY = 40; 34 CEBuffer()35 CEBuffer() {} 36 append(long ce)37 void append(long ce) { 38 if(length >= INITIAL_CAPACITY) { 39 ensureAppendCapacity(1); 40 } 41 buffer[length++] = ce; 42 } 43 appendUnsafe(long ce)44 void appendUnsafe(long ce) { 45 buffer[length++] = ce; 46 } 47 ensureAppendCapacity(int appCap)48 void ensureAppendCapacity(int appCap) { 49 int capacity = buffer.length; 50 if((length + appCap) <= capacity) { return; } 51 do { 52 if(capacity < 1000) { 53 capacity *= 4; 54 } else { 55 capacity *= 2; 56 } 57 } while(capacity < (length + appCap)); 58 long[] newBuffer = new long[capacity]; 59 System.arraycopy(buffer, 0, newBuffer, 0, length); 60 buffer = newBuffer; 61 } 62 incLength()63 void incLength() { 64 // Use INITIAL_CAPACITY for a very simple fastpath. 65 // (Rather than buffer.getCapacity().) 66 if(length >= INITIAL_CAPACITY) { 67 ensureAppendCapacity(1); 68 } 69 ++length; 70 } 71 set(int i, long ce)72 long set(int i, long ce) { 73 return buffer[i] = ce; 74 } get(int i)75 long get(int i) { return buffer[i]; } 76 getCEs()77 long[] getCEs() { return buffer; } 78 79 int length = 0; 80 81 private long[] buffer = new long[INITIAL_CAPACITY]; 82 } 83 84 // State of combining marks skipped in discontiguous contraction. 85 // We create a state object on first use and keep it around deactivated between uses. 86 private static final class SkippedState { 87 // Born active but empty. SkippedState()88 SkippedState() {} clear()89 void clear() { 90 oldBuffer.setLength(0); 91 pos = 0; 92 // The newBuffer is reset by setFirstSkipped(). 93 } 94 isEmpty()95 boolean isEmpty() { return oldBuffer.length() == 0; } 96 hasNext()97 boolean hasNext() { return pos < oldBuffer.length(); } 98 99 // Requires hasNext(). next()100 int next() { 101 int c = oldBuffer.codePointAt(pos); 102 pos += Character.charCount(c); 103 return c; 104 } 105 106 // Accounts for one more input code point read beyond the end of the marks buffer. incBeyond()107 void incBeyond() { 108 assert(!hasNext()); 109 ++pos; 110 } 111 112 // Goes backward through the skipped-marks buffer. 113 // Returns the number of code points read beyond the skipped marks 114 // that need to be backtracked through normal input. backwardNumCodePoints(int n)115 int backwardNumCodePoints(int n) { 116 int length = oldBuffer.length(); 117 int beyond = pos - length; 118 if(beyond > 0) { 119 if(beyond >= n) { 120 // Not back far enough to re-enter the oldBuffer. 121 pos -= n; 122 return n; 123 } else { 124 // Back out all beyond-oldBuffer code points and re-enter the buffer. 125 pos = oldBuffer.offsetByCodePoints(length, beyond - n); 126 return beyond; 127 } 128 } else { 129 // Go backwards from inside the oldBuffer. 130 pos = oldBuffer.offsetByCodePoints(pos, -n); 131 return 0; 132 } 133 } 134 setFirstSkipped(int c)135 void setFirstSkipped(int c) { 136 skipLengthAtMatch = 0; 137 newBuffer.setLength(0); 138 newBuffer.appendCodePoint(c); 139 } 140 skip(int c)141 void skip(int c) { 142 newBuffer.appendCodePoint(c); 143 } 144 recordMatch()145 void recordMatch() { skipLengthAtMatch = newBuffer.length(); } 146 147 // Replaces the characters we consumed with the newly skipped ones. replaceMatch()148 void replaceMatch() { 149 // Note: UnicodeString.replace() pins pos to at most length(). 150 int oldLength = oldBuffer.length(); 151 if(pos > oldLength) { pos = oldLength; } 152 oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch); 153 pos = 0; 154 } 155 saveTrieState(CharsTrie trie)156 void saveTrieState(CharsTrie trie) { trie.saveState(state); } resetToTrieState(CharsTrie trie)157 void resetToTrieState(CharsTrie trie) { trie.resetToState(state); } 158 159 // Combining marks skipped in previous discontiguous-contraction matching. 160 // After that discontiguous contraction was completed, we start reading them from here. 161 private final StringBuilder oldBuffer = new StringBuilder(); 162 // Combining marks newly skipped in current discontiguous-contraction matching. 163 // These might have been read from the normal text or from the oldBuffer. 164 private final StringBuilder newBuffer = new StringBuilder(); 165 // Reading index in oldBuffer, 166 // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()). 167 private int pos; 168 // newBuffer.length() at the time of the last matching character. 169 // When a partial match fails, we back out skipped and partial-matching input characters. 170 private int skipLengthAtMatch; 171 // We save the trie state before we attempt to match a character, 172 // so that we can skip it and try the next one. 173 private CharsTrie.State state = new CharsTrie.State(); 174 }; 175 176 /** 177 * Partially constructs the iterator. 178 * In Java, we cache partially constructed iterators 179 * and finish their setup when starting to work on text 180 * (via reset(boolean) and the setText(numeric, ...) methods of subclasses). 181 * This avoids memory allocations for iterators that remain unused. 182 * 183 * <p>In C++, there is only one constructor, and iterators are 184 * stack-allocated as needed. 185 */ CollationIterator(CollationData d)186 public CollationIterator(CollationData d) { 187 trie = d.trie; 188 data = d; 189 numCpFwd = -1; 190 isNumeric = false; 191 ceBuffer = null; 192 } 193 CollationIterator(CollationData d, boolean numeric)194 public CollationIterator(CollationData d, boolean numeric) { 195 trie = d.trie; 196 data = d; 197 numCpFwd = -1; 198 isNumeric = numeric; 199 ceBuffer = new CEBuffer(); 200 } 201 202 @Override equals(Object other)203 public boolean equals(Object other) { 204 // Subclasses: Call this method and then add more specific checks. 205 // Compare the iterator state but not the collation data (trie & data fields): 206 // Assume that the caller compares the data. 207 // Ignore skipped since that should be unused between calls to nextCE(). 208 // (It only stays around to avoid another memory allocation.) 209 if(other == null) { return false; } 210 if(!this.getClass().equals(other.getClass())) { return false; } 211 CollationIterator o = (CollationIterator)other; 212 if(!(ceBuffer.length == o.ceBuffer.length && 213 cesIndex == o.cesIndex && 214 numCpFwd == o.numCpFwd && 215 isNumeric == o.isNumeric)) { 216 return false; 217 } 218 for(int i = 0; i < ceBuffer.length; ++i) { 219 if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; } 220 } 221 return true; 222 } 223 224 @Override hashCode()225 public int hashCode() { 226 // Dummy return to prevent compile warnings. 227 return 0; 228 } 229 230 /** 231 * Resets the iterator state and sets the position to the specified offset. 232 * Subclasses must implement, and must call the parent class method, 233 * or CollationIterator.reset(). 234 */ resetToOffset(int newOffset)235 public abstract void resetToOffset(int newOffset); 236 getOffset()237 public abstract int getOffset(); 238 239 /** 240 * Returns the next collation element. 241 */ nextCE()242 public final long nextCE() { 243 if(cesIndex < ceBuffer.length) { 244 // Return the next buffered CE. 245 return ceBuffer.get(cesIndex++); 246 } 247 assert cesIndex == ceBuffer.length; 248 ceBuffer.incLength(); 249 long cAndCE32 = handleNextCE32(); 250 int c = (int)(cAndCE32 >> 32); 251 int ce32 = (int)cAndCE32; 252 int t = ce32 & 0xff; 253 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). 254 // Normal CE from the main data. 255 // Forced-inline of ceFromSimpleCE32(ce32). 256 return ceBuffer.set(cesIndex++, 257 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 258 } 259 CollationData d; 260 // The compiler should be able to optimize the previous and the following 261 // comparisons of t with the same constant. 262 if(t == Collation.SPECIAL_CE32_LOW_BYTE) { 263 if(c < 0) { 264 return ceBuffer.set(cesIndex++, Collation.NO_CE); 265 } 266 d = data.base; 267 ce32 = d.getCE32(c); 268 t = ce32 & 0xff; 269 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { 270 // Normal CE from the base data. 271 return ceBuffer.set(cesIndex++, 272 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 273 } 274 } else { 275 d = data; 276 } 277 if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) { 278 // Forced-inline of ceFromLongPrimaryCE32(ce32). 279 return ceBuffer.set(cesIndex++, 280 ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE); 281 } 282 return nextCEFromCE32(d, c, ce32); 283 } 284 285 /** 286 * Fetches all CEs. 287 * @return getCEsLength() 288 */ fetchCEs()289 public final int fetchCEs() { 290 while(nextCE() != Collation.NO_CE) { 291 // No need to loop for each expansion CE. 292 cesIndex = ceBuffer.length; 293 } 294 return ceBuffer.length; 295 } 296 297 /** 298 * Overwrites the current CE (the last one returned by nextCE()). 299 */ setCurrentCE(long ce)300 final void setCurrentCE(long ce) { 301 assert cesIndex > 0; 302 ceBuffer.set(cesIndex - 1, ce); 303 } 304 305 /** 306 * Returns the previous collation element. 307 */ previousCE(UVector32 offsets)308 public final long previousCE(UVector32 offsets) { 309 if(ceBuffer.length > 0) { 310 // Return the previous buffered CE. 311 return ceBuffer.get(--ceBuffer.length); 312 } 313 offsets.removeAllElements(); 314 int limitOffset = getOffset(); 315 int c = previousCodePoint(); 316 if(c < 0) { return Collation.NO_CE; } 317 if(data.isUnsafeBackward(c, isNumeric)) { 318 return previousCEUnsafe(c, offsets); 319 } 320 // Simple, safe-backwards iteration: 321 // Get a CE going backwards, handle prefixes but no contractions. 322 int ce32 = data.getCE32(c); 323 CollationData d; 324 if(ce32 == Collation.FALLBACK_CE32) { 325 d = data.base; 326 ce32 = d.getCE32(c); 327 } else { 328 d = data; 329 } 330 if(Collation.isSimpleOrLongCE32(ce32)) { 331 return Collation.ceFromCE32(ce32); 332 } 333 appendCEsFromCE32(d, c, ce32, false); 334 if(ceBuffer.length > 1) { 335 offsets.addElement(getOffset()); 336 // For an expansion, the offset of each non-initial CE is the limit offset, 337 // consistent with forward iteration. 338 while(offsets.size() <= ceBuffer.length) { 339 offsets.addElement(limitOffset); 340 }; 341 } 342 return ceBuffer.get(--ceBuffer.length); 343 } 344 getCEsLength()345 public final int getCEsLength() { 346 return ceBuffer.length; 347 } 348 getCE(int i)349 public final long getCE(int i) { 350 return ceBuffer.get(i); 351 } 352 getCEs()353 public final long[] getCEs() { 354 return ceBuffer.getCEs(); 355 } 356 clearCEs()357 final void clearCEs() { 358 cesIndex = ceBuffer.length = 0; 359 } 360 clearCEsIfNoneRemaining()361 public final void clearCEsIfNoneRemaining() { 362 if(cesIndex == ceBuffer.length) { clearCEs(); } 363 } 364 365 /** 366 * Returns the next code point (with post-increment). 367 * Public for identical-level comparison and for testing. 368 */ nextCodePoint()369 public abstract int nextCodePoint(); 370 371 /** 372 * Returns the previous code point (with pre-decrement). 373 * Public for identical-level comparison and for testing. 374 */ previousCodePoint()375 public abstract int previousCodePoint(); 376 reset()377 protected final void reset() { 378 cesIndex = ceBuffer.length = 0; 379 if(skipped != null) { skipped.clear(); } 380 } 381 /** 382 * Resets the state as well as the numeric setting, 383 * and completes the initialization. 384 * Only exists in Java where we reset cached CollationIterator instances 385 * rather than stack-allocating temporary ones. 386 * (See also the constructor comments.) 387 */ reset(boolean numeric)388 protected final void reset(boolean numeric) { 389 if(ceBuffer == null) { 390 ceBuffer = new CEBuffer(); 391 } 392 reset(); 393 isNumeric = numeric; 394 } 395 396 /** 397 * Returns the next code point and its local CE32 value. 398 * Returns Collation.FALLBACK_CE32 at the end of the text (c<0) 399 * or when c's CE32 value is to be looked up in the base data (fallback). 400 * 401 * The code point is used for fallbacks, context and implicit weights. 402 * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). 403 * 404 * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0. 405 */ handleNextCE32()406 protected long handleNextCE32() { 407 int c = nextCodePoint(); 408 if(c < 0) { return NO_CP_AND_CE32; } 409 return makeCodePointAndCE32Pair(c, data.getCE32(c)); 410 } makeCodePointAndCE32Pair(int c, int ce32)411 protected long makeCodePointAndCE32Pair(int c, int ce32) { 412 return ((long)c << 32) | (ce32 & 0xffffffffL); 413 } 414 protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL); 415 416 /** 417 * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. 418 * Returns the trail surrogate in that case and advances past it, 419 * if a trail surrogate follows the lead surrogate. 420 * Otherwise returns any other code unit and does not advance. 421 */ handleGetTrailSurrogate()422 protected char handleGetTrailSurrogate() { 423 return 0; 424 } 425 426 /** 427 * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. 428 * (Not needed in Java.) 429 */ 430 /*protected boolean foundNULTerminator() { 431 return false; 432 }*/ 433 434 /** 435 * @return false if surrogate code points U+D800..U+DFFF 436 * map to their own implicit primary weights (for UTF-16), 437 * or true if they map to CE(U+FFFD) (for UTF-8) 438 */ forbidSurrogateCodePoints()439 protected boolean forbidSurrogateCodePoints() { 440 return false; 441 } 442 forwardNumCodePoints(int num)443 protected abstract void forwardNumCodePoints(int num); 444 backwardNumCodePoints(int num)445 protected abstract void backwardNumCodePoints(int num); 446 447 /** 448 * Returns the CE32 from the data trie. 449 * Normally the same as data.getCE32(), but overridden in the builder. 450 * Call this only when the faster data.getCE32() cannot be used. 451 */ getDataCE32(int c)452 protected int getDataCE32(int c) { 453 return data.getCE32(c); 454 } 455 getCE32FromBuilderData(int ce32)456 protected int getCE32FromBuilderData(int ce32) { 457 throw new ICUException("internal program error: should be unreachable"); 458 } 459 appendCEsFromCE32(CollationData d, int c, int ce32, boolean forward)460 protected final void appendCEsFromCE32(CollationData d, int c, int ce32, 461 boolean forward) { 462 while(Collation.isSpecialCE32(ce32)) { 463 switch(Collation.tagFromCE32(ce32)) { 464 case Collation.FALLBACK_TAG: 465 case Collation.RESERVED_TAG_3: 466 throw new ICUException("internal program error: should be unreachable"); 467 case Collation.LONG_PRIMARY_TAG: 468 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32)); 469 return; 470 case Collation.LONG_SECONDARY_TAG: 471 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32)); 472 return; 473 case Collation.LATIN_EXPANSION_TAG: 474 ceBuffer.ensureAppendCapacity(2); 475 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32)); 476 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32)); 477 ceBuffer.length += 2; 478 return; 479 case Collation.EXPANSION32_TAG: { 480 int index = Collation.indexFromCE32(ce32); 481 int length = Collation.lengthFromCE32(ce32); 482 ceBuffer.ensureAppendCapacity(length); 483 do { 484 ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++])); 485 } while(--length > 0); 486 return; 487 } 488 case Collation.EXPANSION_TAG: { 489 int index = Collation.indexFromCE32(ce32); 490 int length = Collation.lengthFromCE32(ce32); 491 ceBuffer.ensureAppendCapacity(length); 492 do { 493 ceBuffer.appendUnsafe(d.ces[index++]); 494 } while(--length > 0); 495 return; 496 } 497 case Collation.BUILDER_DATA_TAG: 498 ce32 = getCE32FromBuilderData(ce32); 499 if(ce32 == Collation.FALLBACK_CE32) { 500 d = data.base; 501 ce32 = d.getCE32(c); 502 } 503 break; 504 case Collation.PREFIX_TAG: 505 if(forward) { backwardNumCodePoints(1); } 506 ce32 = getCE32FromPrefix(d, ce32); 507 if(forward) { forwardNumCodePoints(1); } 508 break; 509 case Collation.CONTRACTION_TAG: { 510 int index = Collation.indexFromCE32(ce32); 511 int defaultCE32 = d.getCE32FromContexts(index); // Default if no suffix match. 512 if(!forward) { 513 // Backward contractions are handled by previousCEUnsafe(). 514 // c has contractions but they were not found. 515 ce32 = defaultCE32; 516 break; 517 } 518 int nextCp; 519 if(skipped == null && numCpFwd < 0) { 520 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path, 521 // avoiding the function call and the nextSkippedCodePoint() overhead. 522 nextCp = nextCodePoint(); 523 if(nextCp < 0) { 524 // No more text. 525 ce32 = defaultCE32; 526 break; 527 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 528 !CollationFCD.mayHaveLccc(nextCp)) { 529 // All contraction suffixes start with characters with lccc!=0 530 // but the next code point has lccc==0. 531 backwardNumCodePoints(1); 532 ce32 = defaultCE32; 533 break; 534 } 535 } else { 536 nextCp = nextSkippedCodePoint(); 537 if(nextCp < 0) { 538 // No more text. 539 ce32 = defaultCE32; 540 break; 541 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 542 !CollationFCD.mayHaveLccc(nextCp)) { 543 // All contraction suffixes start with characters with lccc!=0 544 // but the next code point has lccc==0. 545 backwardNumSkipped(1); 546 ce32 = defaultCE32; 547 break; 548 } 549 } 550 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp); 551 if(ce32 == Collation.NO_CE32) { 552 // CEs from a discontiguous contraction plus the skipped combining marks 553 // have been appended already. 554 return; 555 } 556 break; 557 } 558 case Collation.DIGIT_TAG: 559 if(isNumeric) { 560 appendNumericCEs(ce32, forward); 561 return; 562 } else { 563 // Fetch the non-numeric-collation CE32 and continue. 564 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 565 break; 566 } 567 case Collation.U0000_TAG: 568 assert(c == 0); 569 // NUL-terminated input not supported in Java. 570 // Fetch the normal ce32 for U+0000 and continue. 571 ce32 = d.ce32s[0]; 572 break; 573 case Collation.HANGUL_TAG: { 574 int[] jamoCE32s = d.jamoCE32s; 575 c -= Hangul.HANGUL_BASE; 576 int t = c % Hangul.JAMO_T_COUNT; 577 c /= Hangul.JAMO_T_COUNT; 578 int v = c % Hangul.JAMO_V_COUNT; 579 c /= Hangul.JAMO_V_COUNT; 580 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) { 581 // None of the Jamo CE32s are isSpecialCE32(). 582 // Avoid recursive function calls and per-Jamo tests. 583 ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3); 584 ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c])); 585 ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v])); 586 ceBuffer.length += 2; 587 if(t != 0) { 588 ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t])); 589 } 590 return; 591 } else { 592 // We should not need to compute each Jamo code point. 593 // In particular, there should be no offset or implicit ce32. 594 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward); 595 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward); 596 if(t == 0) { return; } 597 // offset 39 = 19 + 21 - 1: 598 // 19 = JAMO_L_COUNT 599 // 21 = JAMO_T_COUNT 600 // -1 = omit t==0 601 ce32 = jamoCE32s[39 + t]; 602 c = Collation.SENTINEL_CP; 603 break; 604 } 605 } 606 case Collation.LEAD_SURROGATE_TAG: { 607 assert(forward); // Backward iteration should never see lead surrogate code _unit_ data. 608 assert(isLeadSurrogate(c)); 609 char trail; 610 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) { 611 c = Character.toCodePoint((char)c, trail); 612 ce32 &= Collation.LEAD_TYPE_MASK; 613 if(ce32 == Collation.LEAD_ALL_UNASSIGNED) { 614 ce32 = Collation.UNASSIGNED_CE32; // unassigned-implicit 615 } else if(ce32 == Collation.LEAD_ALL_FALLBACK || 616 (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) { 617 // fall back to the base data 618 d = d.base; 619 ce32 = d.getCE32FromSupplementary(c); 620 } 621 } else { 622 // c is an unpaired surrogate. 623 ce32 = Collation.UNASSIGNED_CE32; 624 } 625 break; 626 } 627 case Collation.OFFSET_TAG: 628 assert(c >= 0); 629 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32)); 630 return; 631 case Collation.IMPLICIT_TAG: 632 assert(c >= 0); 633 if(isSurrogate(c) && forbidSurrogateCodePoints()) { 634 ce32 = Collation.FFFD_CE32; 635 break; 636 } else { 637 ceBuffer.append(Collation.unassignedCEFromCodePoint(c)); 638 return; 639 } 640 } 641 } 642 ceBuffer.append(Collation.ceFromSimpleCE32(ce32)); 643 } 644 645 // TODO: Propose widening the UTF16 method. isSurrogate(int c)646 private static final boolean isSurrogate(int c) { 647 return (c & 0xfffff800) == 0xd800; 648 } 649 650 // TODO: Propose widening the UTF16 method. isLeadSurrogate(int c)651 protected static final boolean isLeadSurrogate(int c) { 652 return (c & 0xfffffc00) == 0xd800; 653 } 654 655 // TODO: Propose widening the UTF16 method. isTrailSurrogate(int c)656 protected static final boolean isTrailSurrogate(int c) { 657 return (c & 0xfffffc00) == 0xdc00; 658 } 659 660 // Main lookup trie of the data object. 661 protected final Trie2_32 trie; 662 protected final CollationData data; 663 nextCEFromCE32(CollationData d, int c, int ce32)664 private final long nextCEFromCE32(CollationData d, int c, int ce32) { 665 --ceBuffer.length; // Undo ceBuffer.incLength(). 666 appendCEsFromCE32(d, c, ce32, true); 667 return ceBuffer.get(cesIndex++); 668 } 669 getCE32FromPrefix(CollationData d, int ce32)670 private final int getCE32FromPrefix(CollationData d, int ce32) { 671 int index = Collation.indexFromCE32(ce32); 672 ce32 = d.getCE32FromContexts(index); // Default if no prefix match. 673 index += 2; 674 // Number of code points read before the original code point. 675 int lookBehind = 0; 676 CharsTrie prefixes = new CharsTrie(d.contexts, index); 677 for(;;) { 678 int c = previousCodePoint(); 679 if(c < 0) { break; } 680 ++lookBehind; 681 BytesTrie.Result match = prefixes.nextForCodePoint(c); 682 if(match.hasValue()) { 683 ce32 = prefixes.getValue(); 684 } 685 if(!match.hasNext()) { break; } 686 } 687 forwardNumCodePoints(lookBehind); 688 return ce32; 689 } 690 nextSkippedCodePoint()691 private final int nextSkippedCodePoint() { 692 if(skipped != null && skipped.hasNext()) { return skipped.next(); } 693 if(numCpFwd == 0) { return Collation.SENTINEL_CP; } 694 int c = nextCodePoint(); 695 if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); } 696 if(numCpFwd > 0 && c >= 0) { --numCpFwd; } 697 return c; 698 } 699 backwardNumSkipped(int n)700 private final void backwardNumSkipped(int n) { 701 if(skipped != null && !skipped.isEmpty()) { 702 n = skipped.backwardNumCodePoints(n); 703 } 704 backwardNumCodePoints(n); 705 if(numCpFwd >= 0) { numCpFwd += n; } 706 } 707 nextCE32FromContraction( CollationData d, int contractionCE32, CharSequence trieChars, int trieOffset, int ce32, int c)708 private final int nextCE32FromContraction( 709 CollationData d, int contractionCE32, 710 CharSequence trieChars, int trieOffset, int ce32, int c) { 711 // c: next code point after the original one 712 713 // Number of code points read beyond the original code point. 714 // Needed for discontiguous contraction matching. 715 int lookAhead = 1; 716 // Number of code points read since the last match (initially only c). 717 int sinceMatch = 1; 718 // Normally we only need a contiguous match, 719 // and therefore need not remember the suffixes state from before a mismatch for retrying. 720 // If we are already processing skipped combining marks, then we do track the state. 721 CharsTrie suffixes = new CharsTrie(trieChars, trieOffset); 722 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 723 BytesTrie.Result match = suffixes.firstForCodePoint(c); 724 for(;;) { 725 int nextCp; 726 if(match.hasValue()) { 727 ce32 = suffixes.getValue(); 728 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) { 729 return ce32; 730 } 731 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 732 sinceMatch = 1; 733 } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) { 734 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text. 735 // Back up if necessary, and try a discontiguous contraction. 736 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 && 737 // Discontiguous contraction matching extends an existing match. 738 // If there is no match yet, then there is nothing to do. 739 ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 || 740 sinceMatch < lookAhead)) { 741 // The last character of at least one suffix has lccc!=0, 742 // allowing for discontiguous contractions. 743 // UCA S2.1.1 only processes non-starters immediately following 744 // "a match in the table" (sinceMatch=1). 745 if(sinceMatch > 1) { 746 // Return to the state after the last match. 747 // (Return to sinceMatch=0 and re-fetch the first partially-matched character.) 748 backwardNumSkipped(sinceMatch); 749 c = nextSkippedCodePoint(); 750 lookAhead -= sinceMatch - 1; 751 sinceMatch = 1; 752 } 753 if(d.getFCD16(c) > 0xff) { 754 return nextCE32FromDiscontiguousContraction( 755 d, suffixes, ce32, lookAhead, c); 756 } 757 } 758 break; 759 } else { 760 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c. 761 // It does not have a result value, therefore it is not itself "a match in the table". 762 // If a partially-matched c has ccc!=0 then 763 // it might be skipped in discontiguous contraction. 764 c = nextCp; 765 ++sinceMatch; 766 } 767 ++lookAhead; 768 match = suffixes.nextForCodePoint(c); 769 } 770 backwardNumSkipped(sinceMatch); 771 return ce32; 772 } 773 nextCE32FromDiscontiguousContraction( CollationData d, CharsTrie suffixes, int ce32, int lookAhead, int c)774 private final int nextCE32FromDiscontiguousContraction( 775 CollationData d, CharsTrie suffixes, int ce32, 776 int lookAhead, int c) { 777 // UCA section 3.3.2 Contractions: 778 // Contractions that end with non-starter characters 779 // are known as discontiguous contractions. 780 // ... discontiguous contractions must be detected in input text 781 // whenever the final sequence of non-starter characters could be rearranged 782 // so as to make a contiguous matching sequence that is canonically equivalent. 783 784 // UCA: http://www.unicode.org/reports/tr10/#S2.1 785 // S2.1 Find the longest initial substring S at each point that has a match in the table. 786 // S2.1.1 If there are any non-starters following S, process each non-starter C. 787 // S2.1.2 If C is not blocked from S, find if S + C has a match in the table. 788 // Note: A non-starter in a string is called blocked 789 // if there is another non-starter of the same canonical combining class or zero 790 // between it and the last character of canonical combining class 0. 791 // S2.1.3 If there is a match, replace S by S + C, and remove C. 792 793 // First: Is a discontiguous contraction even possible? 794 int fcd16 = d.getFCD16(c); 795 assert(fcd16 > 0xff); // The caller checked this already, as a shortcut. 796 int nextCp = nextSkippedCodePoint(); 797 if(nextCp < 0) { 798 // No further text. 799 backwardNumSkipped(1); 800 return ce32; 801 } 802 ++lookAhead; 803 int prevCC = fcd16 & 0xff; 804 fcd16 = d.getFCD16(nextCp); 805 if(fcd16 <= 0xff) { 806 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 807 backwardNumSkipped(2); 808 return ce32; 809 } 810 811 // We have read and matched (lookAhead-2) code points, 812 // read non-matching c and peeked ahead at nextCp. 813 // Return to the state before the mismatch and continue matching with nextCp. 814 if(skipped == null || skipped.isEmpty()) { 815 if(skipped == null) { 816 skipped = new SkippedState(); 817 } 818 suffixes.reset(); 819 if(lookAhead > 2) { 820 // Replay the partial match so far. 821 backwardNumCodePoints(lookAhead); 822 suffixes.firstForCodePoint(nextCodePoint()); 823 for(int i = 3; i < lookAhead; ++i) { 824 suffixes.nextForCodePoint(nextCodePoint()); 825 } 826 // Skip c (which did not match) and nextCp (which we will try now). 827 forwardNumCodePoints(2); 828 } 829 skipped.saveTrieState(suffixes); 830 } else { 831 // Reset to the trie state before the failed match of c. 832 skipped.resetToTrieState(suffixes); 833 } 834 835 skipped.setFirstSkipped(c); 836 // Number of code points read since the last match (at this point: c and nextCp). 837 int sinceMatch = 2; 838 c = nextCp; 839 for(;;) { 840 BytesTrie.Result match; 841 // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2) 842 if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) { 843 // "If there is a match, replace S by S + C, and remove C." (S2.1.3) 844 // Keep prevCC unchanged. 845 ce32 = suffixes.getValue(); 846 sinceMatch = 0; 847 skipped.recordMatch(); 848 if(!match.hasNext()) { break; } 849 skipped.saveTrieState(suffixes); 850 } else { 851 // No match for "S + C", skip C. 852 skipped.skip(c); 853 skipped.resetToTrieState(suffixes); 854 prevCC = fcd16 & 0xff; 855 } 856 if((c = nextSkippedCodePoint()) < 0) { break; } 857 ++sinceMatch; 858 fcd16 = d.getFCD16(c); 859 if(fcd16 <= 0xff) { 860 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 861 break; 862 } 863 } 864 backwardNumSkipped(sinceMatch); 865 boolean isTopDiscontiguous = skipped.isEmpty(); 866 skipped.replaceMatch(); 867 if(isTopDiscontiguous && !skipped.isEmpty()) { 868 // We did get a match after skipping one or more combining marks, 869 // and we are not in a recursive discontiguous contraction. 870 // Append CEs from the contraction ce32 871 // and then from the combining marks that we skipped before the match. 872 c = Collation.SENTINEL_CP; 873 for(;;) { 874 appendCEsFromCE32(d, c, ce32, true); 875 // Fetch CE32s for skipped combining marks from the normal data, with fallback, 876 // rather than from the CollationData where we found the contraction. 877 if(!skipped.hasNext()) { break; } 878 c = skipped.next(); 879 ce32 = getDataCE32(c); 880 if(ce32 == Collation.FALLBACK_CE32) { 881 d = data.base; 882 ce32 = d.getCE32(c); 883 } else { 884 d = data; 885 } 886 // Note: A nested discontiguous-contraction match 887 // replaces consumed combining marks with newly skipped ones 888 // and resets the reading position to the beginning. 889 } 890 skipped.clear(); 891 ce32 = Collation.NO_CE32; // Signal to the caller that the result is in the ceBuffer. 892 } 893 return ce32; 894 } 895 896 /** 897 * Returns the previous CE when data.isUnsafeBackward(c, isNumeric). 898 */ previousCEUnsafe(int c, UVector32 offsets)899 private final long previousCEUnsafe(int c, UVector32 offsets) { 900 // We just move through the input counting safe and unsafe code points 901 // without collecting the unsafe-backward substring into a buffer and 902 // switching to it. 903 // This is to keep the logic simple. Otherwise we would have to handle 904 // prefix matching going before the backward buffer, switching 905 // to iteration and back, etc. 906 // In the most important case of iterating over a normal string, 907 // reading from the string itself is already maximally fast. 908 // The only drawback there is that after getting the CEs we always 909 // skip backward to the safe character rather than switching out 910 // of a backwardBuffer. 911 // But this should not be the common case for previousCE(), 912 // and correctness and maintainability are more important than 913 // complex optimizations. 914 // Find the first safe character before c. 915 int numBackward = 1; 916 while((c = previousCodePoint()) >= 0) { 917 ++numBackward; 918 if(!data.isUnsafeBackward(c, isNumeric)) { 919 break; 920 } 921 } 922 // Set the forward iteration limit. 923 // Note: This counts code points. 924 // We cannot enforce a limit in the middle of a surrogate pair or similar. 925 numCpFwd = numBackward; 926 // Reset the forward iterator. 927 cesIndex = 0; 928 assert(ceBuffer.length == 0); 929 // Go forward and collect the CEs. 930 int offset = getOffset(); 931 while(numCpFwd > 0) { 932 // nextCE() normally reads one code point. 933 // Contraction matching and digit specials read more and check numCpFwd. 934 --numCpFwd; 935 // Append one or more CEs to the ceBuffer. 936 nextCE(); 937 assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE); 938 // No need to loop for getting each expansion CE from nextCE(). 939 cesIndex = ceBuffer.length; 940 // However, we need to write an offset for each CE. 941 // This is for CollationElementIterator.getOffset() to return 942 // intermediate offsets from the unsafe-backwards segment. 943 assert(offsets.size() < ceBuffer.length); 944 offsets.addElement(offset); 945 // For an expansion, the offset of each non-initial CE is the limit offset, 946 // consistent with forward iteration. 947 offset = getOffset(); 948 while(offsets.size() < ceBuffer.length) { 949 offsets.addElement(offset); 950 }; 951 } 952 assert(offsets.size() == ceBuffer.length); 953 // End offset corresponding to just after the unsafe-backwards segment. 954 offsets.addElement(offset); 955 // Reset the forward iteration limit 956 // and move backward to before the segment for which we fetched CEs. 957 numCpFwd = -1; 958 backwardNumCodePoints(numBackward); 959 // Use the collected CEs and return the last one. 960 cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented. 961 return ceBuffer.get(--ceBuffer.length); 962 } 963 964 /** 965 * Turns a string of digits (bytes 0..9) 966 * into a sequence of CEs that will sort in numeric order. 967 * 968 * Starts from this ce32's digit value and consumes the following/preceding digits. 969 * The digits string must not be empty and must not have leading zeros. 970 */ 971 private final void appendNumericCEs(int ce32, boolean forward) { 972 // Collect digits. 973 // TODO: Use some kind of a byte buffer? We only store values 0..9. 974 StringBuilder digits = new StringBuilder(); 975 if(forward) { 976 for(;;) { 977 char digit = Collation.digitFromCE32(ce32); 978 digits.append(digit); 979 if(numCpFwd == 0) { break; } 980 int c = nextCodePoint(); 981 if(c < 0) { break; } 982 ce32 = data.getCE32(c); 983 if(ce32 == Collation.FALLBACK_CE32) { 984 ce32 = data.base.getCE32(c); 985 } 986 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 987 backwardNumCodePoints(1); 988 break; 989 } 990 if(numCpFwd > 0) { --numCpFwd; } 991 } 992 } else { 993 for(;;) { 994 char digit = Collation.digitFromCE32(ce32); 995 digits.append(digit); 996 int c = previousCodePoint(); 997 if(c < 0) { break; } 998 ce32 = data.getCE32(c); 999 if(ce32 == Collation.FALLBACK_CE32) { 1000 ce32 = data.base.getCE32(c); 1001 } 1002 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 1003 forwardNumCodePoints(1); 1004 break; 1005 } 1006 } 1007 // Reverse the digit string. 1008 digits.reverse(); 1009 } 1010 int pos = 0; 1011 do { 1012 // Skip leading zeros. 1013 while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; } 1014 // Write a sequence of CEs for at most 254 digits at a time. 1015 int segmentLength = digits.length() - pos; 1016 if(segmentLength > 254) { segmentLength = 254; } 1017 appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength)); 1018 pos += segmentLength; 1019 } while(pos < digits.length()); 1020 } 1021 1022 /** 1023 * Turns 1..254 digits into a sequence of CEs. 1024 * Called by appendNumericCEs() for each segment of at most 254 digits. 1025 */ 1026 private final void appendNumericSegmentCEs(CharSequence digits) { 1027 int length = digits.length(); 1028 assert(1 <= length && length <= 254); 1029 assert(length == 1 || digits.charAt(0) != 0); 1030 long numericPrimary = data.numericPrimary; 1031 // Note: We use primary byte values 2..255: digits are not compressible. 1032 if(length <= 7) { 1033 // Very dense encoding for small numbers. 1034 int value = digits.charAt(0); 1035 for(int i = 1; i < length; ++i) { 1036 value = value * 10 + digits.charAt(i); 1037 } 1038 // Primary weight second byte values: 1039 // 74 byte values 2.. 75 for small numbers in two-byte primary weights. 1040 // 40 byte values 76..115 for medium numbers in three-byte primary weights. 1041 // 16 byte values 116..131 for large numbers in four-byte primary weights. 1042 // 124 byte values 132..255 for very large numbers with 4..127 digit pairs. 1043 int firstByte = 2; 1044 int numBytes = 74; 1045 if(value < numBytes) { 1046 // Two-byte primary for 0..73, good for day & month numbers etc. 1047 long primary = numericPrimary | ((firstByte + value) << 16); 1048 ceBuffer.append(Collation.makeCE(primary)); 1049 return; 1050 } 1051 value -= numBytes; 1052 firstByte += numBytes; 1053 numBytes = 40; 1054 if(value < numBytes * 254) { 1055 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more. 1056 long primary = numericPrimary | 1057 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8); 1058 ceBuffer.append(Collation.makeCE(primary)); 1059 return; 1060 } 1061 value -= numBytes * 254; 1062 firstByte += numBytes; 1063 numBytes = 16; 1064 if(value < numBytes * 254 * 254) { 1065 // Four-byte primary for 10234..1042489=10234+16*254*254-1. 1066 long primary = numericPrimary | (2 + value % 254); 1067 value /= 254; 1068 primary |= (2 + value % 254) << 8; 1069 value /= 254; 1070 primary |= (firstByte + value % 254) << 16; 1071 ceBuffer.append(Collation.makeCE(primary)); 1072 return; 1073 } 1074 // original value > 1042489 1075 } 1076 assert(length >= 7); 1077 1078 // The second primary byte value 132..255 indicates the number of digit pairs (4..127), 1079 // then we generate primary bytes with those pairs. 1080 // Omit trailing 00 pairs. 1081 // Decrement the value for the last pair. 1082 1083 // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255. 1084 int numPairs = (length + 1) / 2; 1085 long primary = numericPrimary | ((132 - 4 + numPairs) << 16); 1086 // Find the length without trailing 00 pairs. 1087 while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) { 1088 length -= 2; 1089 } 1090 // Read the first pair. 1091 int pair; 1092 int pos; 1093 if((length & 1) != 0) { 1094 // Only "half a pair" if we have an odd number of digits. 1095 pair = digits.charAt(0); 1096 pos = 1; 1097 } else { 1098 pair = digits.charAt(0) * 10 + digits.charAt(1); 1099 pos = 2; 1100 } 1101 pair = 11 + 2 * pair; 1102 // Add the pairs of digits between pos and length. 1103 int shift = 8; 1104 while(pos < length) { 1105 if(shift == 0) { 1106 // Every three pairs/bytes we need to store a 4-byte-primary CE 1107 // and start with a new CE with the '0' primary lead byte. 1108 primary |= pair; 1109 ceBuffer.append(Collation.makeCE(primary)); 1110 primary = numericPrimary; 1111 shift = 16; 1112 } else { 1113 primary |= pair << shift; 1114 shift -= 8; 1115 } 1116 pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1)); 1117 pos += 2; 1118 } 1119 primary |= (pair - 1) << shift; 1120 ceBuffer.append(Collation.makeCE(primary)); 1121 } 1122 1123 private CEBuffer ceBuffer; 1124 private int cesIndex; 1125 1126 private SkippedState skipped; 1127 1128 // Number of code points to read forward, or -1. 1129 // Used as a forward iteration limit in previousCEUnsafe(). 1130 private int numCpFwd; 1131 // Numeric collation (CollationSettings.NUMERIC). 1132 private boolean isNumeric; 1133 } 1134