1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2011-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * created on: 2011jan06 10 * created by: Markus W. Scherer 11 * ported from ICU4C ucharstrie.h/.cpp 12 */ 13 14 package ohos.global.icu.util; 15 16 import java.io.IOException; 17 import java.util.ArrayList; 18 import java.util.NoSuchElementException; 19 20 import ohos.global.icu.text.UTF16; 21 import ohos.global.icu.util.BytesTrie.Result; 22 23 /** 24 * Light-weight, non-const reader class for a CharsTrie. 25 * Traverses a char-serialized data structure with minimal state, 26 * for mapping strings (16-bit-unit sequences) to non-negative integer values. 27 * 28 * <p>This class is not intended for public subclassing. 29 * 30 * @author Markus W. Scherer 31 * @hide exposed on OHOS 32 */ 33 public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> { 34 /** 35 * Constructs a CharsTrie reader instance. 36 * 37 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder, 38 * with the offset indicating the first char of that sequence. 39 * The CharsTrie object will not read more chars than 40 * the CharsTrieBuilder generated in the corresponding build() call. 41 * 42 * <p>The CharSequence is not copied/cloned and must not be modified while 43 * the CharsTrie object is in use. 44 * 45 * @param trieChars CharSequence that contains the serialized trie. 46 * @param offset Root offset of the trie in the CharSequence. 47 */ CharsTrie(CharSequence trieChars, int offset)48 public CharsTrie(CharSequence trieChars, int offset) { 49 chars_=trieChars; 50 pos_=root_=offset; 51 remainingMatchLength_=-1; 52 } 53 54 /** 55 * Copy constructor. 56 * Makes a shallow copy of the other trie reader object and its state. 57 * Does not copy the char array which will be shared. 58 * Same as clone() but without the throws clause. 59 */ CharsTrie(CharsTrie other)60 public CharsTrie(CharsTrie other) { 61 chars_ = other.chars_; 62 root_ = other.root_; 63 pos_ = other.pos_; 64 remainingMatchLength_ = other.remainingMatchLength_; 65 } 66 67 /** 68 * Clones this trie reader object and its state, 69 * but not the char array which will be shared. 70 * @return A shallow clone of this trie. 71 */ 72 @Override clone()73 public CharsTrie clone() throws CloneNotSupportedException { 74 return (CharsTrie) super.clone(); // A shallow copy is just what we need. 75 } 76 77 /** 78 * Resets this trie to its initial state. 79 * @return this 80 */ reset()81 public CharsTrie reset() { 82 pos_=root_; 83 remainingMatchLength_=-1; 84 return this; 85 } 86 87 /** 88 * Returns the state of this trie as a 64-bit integer. 89 * The state value is never 0. 90 * 91 * @return opaque state value 92 * @see #resetToState64 93 */ getState64()94 public long getState64() { 95 return ((long)remainingMatchLength_ << 32) | pos_; 96 } 97 98 /** 99 * Resets this trie to the saved state. 100 * Unlike {@link #resetToState(State)}, the 64-bit state value 101 * must be from {@link #getState64()} from the same trie object or 102 * from one initialized the exact same way. 103 * Because of no validation, this method is faster. 104 * 105 * @param state The opaque trie state value from getState64(). 106 * @return this 107 * @see #getState64 108 * @see #resetToState 109 * @see #reset 110 */ resetToState64(long state)111 public CharsTrie resetToState64(long state) { 112 remainingMatchLength_ = (int)(state >> 32); 113 pos_ = (int)state; 114 return this; 115 } 116 117 /** 118 * CharsTrie state object, for saving a trie's current state 119 * and resetting the trie back to this state later. 120 * @hide exposed on OHOS 121 */ 122 public static final class State { 123 /** 124 * Constructs an empty State. 125 */ State()126 public State() {} 127 private CharSequence chars; 128 private int root; 129 private int pos; 130 private int remainingMatchLength; 131 } 132 133 /** 134 * Saves the state of this trie. 135 * @param state The State object to hold the trie's state. 136 * @return this 137 * @see #resetToState 138 */ saveState(State state)139 public CharsTrie saveState(State state) /*const*/ { 140 state.chars=chars_; 141 state.root=root_; 142 state.pos=pos_; 143 state.remainingMatchLength=remainingMatchLength_; 144 return this; 145 } 146 147 /** 148 * Resets this trie to the saved state. 149 * Slower than {@link #resetToState64(long)} which does not validate the state value. 150 * 151 * @param state The State object which holds a saved trie state. 152 * @return this 153 * @throws IllegalArgumentException if the state object contains no state, 154 * or the state of a different trie 155 * @see #saveState 156 * @see #reset 157 */ resetToState(State state)158 public CharsTrie resetToState(State state) { 159 if(chars_==state.chars && chars_!=null && root_==state.root) { 160 pos_=state.pos; 161 remainingMatchLength_=state.remainingMatchLength; 162 } else { 163 throw new IllegalArgumentException("incompatible trie state"); 164 } 165 return this; 166 } 167 168 /** 169 * Determines whether the string so far matches, whether it has a value, 170 * and whether another input char can continue a matching string. 171 * @return The match/value Result. 172 */ current()173 public Result current() /*const*/ { 174 int pos=pos_; 175 if(pos<0) { 176 return Result.NO_MATCH; 177 } else { 178 int node; 179 return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 180 valueResults_[node>>15] : Result.NO_VALUE; 181 } 182 } 183 184 /** 185 * Traverses the trie from the initial state for this input char. 186 * Equivalent to reset().next(inUnit). 187 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 188 * @return The match/value Result. 189 */ first(int inUnit)190 public Result first(int inUnit) { 191 remainingMatchLength_=-1; 192 return nextImpl(root_, inUnit); 193 } 194 195 /** 196 * Traverses the trie from the initial state for the 197 * one or two UTF-16 code units for this input code point. 198 * Equivalent to reset().nextForCodePoint(cp). 199 * @param cp A Unicode code point 0..0x10ffff. 200 * @return The match/value Result. 201 */ firstForCodePoint(int cp)202 public Result firstForCodePoint(int cp) { 203 return cp<=0xffff ? 204 first(cp) : 205 (first(UTF16.getLeadSurrogate(cp)).hasNext() ? 206 next(UTF16.getTrailSurrogate(cp)) : 207 Result.NO_MATCH); 208 } 209 210 /** 211 * Traverses the trie from the current state for this input char. 212 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 213 * @return The match/value Result. 214 */ next(int inUnit)215 public Result next(int inUnit) { 216 int pos=pos_; 217 if(pos<0) { 218 return Result.NO_MATCH; 219 } 220 int length=remainingMatchLength_; // Actual remaining match length minus 1. 221 if(length>=0) { 222 // Remaining part of a linear-match node. 223 if(inUnit==chars_.charAt(pos++)) { 224 remainingMatchLength_=--length; 225 pos_=pos; 226 int node; 227 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 228 valueResults_[node>>15] : Result.NO_VALUE; 229 } else { 230 stop(); 231 return Result.NO_MATCH; 232 } 233 } 234 return nextImpl(pos, inUnit); 235 } 236 237 /** 238 * Traverses the trie from the current state for the 239 * one or two UTF-16 code units for this input code point. 240 * @param cp A Unicode code point 0..0x10ffff. 241 * @return The match/value Result. 242 */ nextForCodePoint(int cp)243 public Result nextForCodePoint(int cp) { 244 return cp<=0xffff ? 245 next(cp) : 246 (next(UTF16.getLeadSurrogate(cp)).hasNext() ? 247 next(UTF16.getTrailSurrogate(cp)) : 248 Result.NO_MATCH); 249 } 250 251 /** 252 * Traverses the trie from the current state for this string. 253 * Equivalent to 254 * <pre> 255 * Result result=current(); 256 * for(each c in s) 257 * if(!result.hasNext()) return Result.NO_MATCH; 258 * result=next(c); 259 * return result; 260 * </pre> 261 * @param s Contains a string. 262 * @param sIndex The start index of the string in s. 263 * @param sLimit The (exclusive) end index of the string in s. 264 * @return The match/value Result. 265 */ next(CharSequence s, int sIndex, int sLimit)266 public Result next(CharSequence s, int sIndex, int sLimit) { 267 if(sIndex>=sLimit) { 268 // Empty input. 269 return current(); 270 } 271 int pos=pos_; 272 if(pos<0) { 273 return Result.NO_MATCH; 274 } 275 int length=remainingMatchLength_; // Actual remaining match length minus 1. 276 for(;;) { 277 // Fetch the next input unit, if there is one. 278 // Continue a linear-match node. 279 char inUnit; 280 for(;;) { 281 if(sIndex==sLimit) { 282 remainingMatchLength_=length; 283 pos_=pos; 284 int node; 285 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 286 valueResults_[node>>15] : Result.NO_VALUE; 287 } 288 inUnit=s.charAt(sIndex++); 289 if(length<0) { 290 remainingMatchLength_=length; 291 break; 292 } 293 if(inUnit!=chars_.charAt(pos)) { 294 stop(); 295 return Result.NO_MATCH; 296 } 297 ++pos; 298 --length; 299 } 300 int node=chars_.charAt(pos++); 301 for(;;) { 302 if(node<kMinLinearMatch) { 303 Result result=branchNext(pos, node, inUnit); 304 if(result==Result.NO_MATCH) { 305 return Result.NO_MATCH; 306 } 307 // Fetch the next input unit, if there is one. 308 if(sIndex==sLimit) { 309 return result; 310 } 311 if(result==Result.FINAL_VALUE) { 312 // No further matching units. 313 stop(); 314 return Result.NO_MATCH; 315 } 316 inUnit=s.charAt(sIndex++); 317 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 318 node=chars_.charAt(pos++); 319 } else if(node<kMinValueLead) { 320 // Match length+1 units. 321 length=node-kMinLinearMatch; // Actual match length minus 1. 322 if(inUnit!=chars_.charAt(pos)) { 323 stop(); 324 return Result.NO_MATCH; 325 } 326 ++pos; 327 --length; 328 break; 329 } else if((node&kValueIsFinal)!=0) { 330 // No further matching units. 331 stop(); 332 return Result.NO_MATCH; 333 } else { 334 // Skip intermediate value. 335 pos=skipNodeValue(pos, node); 336 node&=kNodeTypeMask; 337 } 338 } 339 } 340 } 341 342 /** 343 * Returns a matching string's value if called immediately after 344 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 345 * getValue() can be called multiple times. 346 * 347 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 348 * @return The value for the string so far. 349 */ getValue()350 public int getValue() /*const*/ { 351 int pos=pos_; 352 int leadUnit=chars_.charAt(pos++); 353 assert(leadUnit>=kMinValueLead); 354 return (leadUnit&kValueIsFinal)!=0 ? 355 readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); 356 } 357 358 /** 359 * Determines whether all strings reachable from the current state 360 * map to the same value, and if so, returns that value. 361 * @return The unique value in bits 32..1 with bit 0 set, 362 * if all strings reachable from the current state 363 * map to the same value; otherwise returns 0. 364 */ getUniqueValue()365 public long getUniqueValue() /*const*/ { 366 int pos=pos_; 367 if(pos<0) { 368 return 0; 369 } 370 // Skip the rest of a pending linear-match node. 371 long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); 372 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 373 return (uniqueValue<<31)>>31; 374 } 375 376 /** 377 * Finds each char which continues the string from the current state. 378 * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. 379 * @param out Each next char is appended to this object. 380 * (Only uses the out.append(c) method.) 381 * @return The number of chars which continue the string from here. 382 */ getNextChars(Appendable out)383 public int getNextChars(Appendable out) /*const*/ { 384 int pos=pos_; 385 if(pos<0) { 386 return 0; 387 } 388 if(remainingMatchLength_>=0) { 389 append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. 390 return 1; 391 } 392 int node=chars_.charAt(pos++); 393 if(node>=kMinValueLead) { 394 if((node&kValueIsFinal)!=0) { 395 return 0; 396 } else { 397 pos=skipNodeValue(pos, node); 398 node&=kNodeTypeMask; 399 } 400 } 401 if(node<kMinLinearMatch) { 402 if(node==0) { 403 node=chars_.charAt(pos++); 404 } 405 getNextBranchChars(chars_, pos, ++node, out); 406 return node; 407 } else { 408 // First unit of the linear-match node. 409 append(out, chars_.charAt(pos)); 410 return 1; 411 } 412 } 413 414 /** 415 * Iterates from the current state of this trie. 416 * @return A new CharsTrie.Iterator. 417 */ 418 @Override iterator()419 public Iterator iterator() { 420 return new Iterator(chars_, pos_, remainingMatchLength_, 0); 421 } 422 423 /** 424 * Iterates from the current state of this trie. 425 * @param maxStringLength If 0, the iterator returns full strings. 426 * Otherwise, the iterator returns strings with this maximum length. 427 * @return A new CharsTrie.Iterator. 428 */ iterator(int maxStringLength)429 public Iterator iterator(int maxStringLength) { 430 return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength); 431 } 432 433 /** 434 * Iterates from the root of a char-serialized BytesTrie. 435 * @param trieChars CharSequence that contains the serialized trie. 436 * @param offset Root offset of the trie in the CharSequence. 437 * @param maxStringLength If 0, the iterator returns full strings. 438 * Otherwise, the iterator returns strings with this maximum length. 439 * @return A new CharsTrie.Iterator. 440 */ iterator(CharSequence trieChars, int offset, int maxStringLength)441 public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) { 442 return new Iterator(trieChars, offset, -1, maxStringLength); 443 } 444 445 /** 446 * Return value type for the Iterator. 447 * @hide exposed on OHOS 448 */ 449 public static final class Entry { 450 /** 451 * The string. 452 */ 453 public CharSequence chars; 454 /** 455 * The value associated with the string. 456 */ 457 public int value; 458 Entry()459 private Entry() { 460 } 461 } 462 463 /** 464 * Iterator for all of the (string, value) pairs in a CharsTrie. 465 * @hide exposed on OHOS 466 */ 467 public static final class Iterator implements java.util.Iterator<Entry> { Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength)468 private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { 469 chars_=trieChars; 470 pos_=initialPos_=offset; 471 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 472 maxLength_=maxStringLength; 473 int length=remainingMatchLength_; // Actual remaining match length minus 1. 474 if(length>=0) { 475 // Pending linear-match node, append remaining bytes to str_. 476 ++length; 477 if(maxLength_>0 && length>maxLength_) { 478 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 479 } 480 str_.append(chars_, pos_, pos_+length); 481 pos_+=length; 482 remainingMatchLength_-=length; 483 } 484 } 485 486 /** 487 * Resets this iterator to its initial state. 488 * @return this 489 */ reset()490 public Iterator reset() { 491 pos_=initialPos_; 492 remainingMatchLength_=initialRemainingMatchLength_; 493 skipValue_=false; 494 int length=remainingMatchLength_+1; // Remaining match length. 495 if(maxLength_>0 && length>maxLength_) { 496 length=maxLength_; 497 } 498 str_.setLength(length); 499 pos_+=length; 500 remainingMatchLength_-=length; 501 stack_.clear(); 502 return this; 503 } 504 505 /** 506 * @return true if there are more elements. 507 */ 508 @Override hasNext()509 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 510 511 /** 512 * Finds the next (string, value) pair if there is one. 513 * 514 * If the string is truncated to the maximum length and does not 515 * have a real value, then the value is set to -1. 516 * In this case, this "not a real value" is indistinguishable from 517 * a real value of -1. 518 * @return An Entry with the string and value of the next element. 519 * @throws NoSuchElementException - iteration has no more elements. 520 */ 521 @Override next()522 public Entry next() { 523 int pos=pos_; 524 if(pos<0) { 525 if(stack_.isEmpty()) { 526 throw new NoSuchElementException(); 527 } 528 // Pop the state off the stack and continue with the next outbound edge of 529 // the branch node. 530 long top=stack_.remove(stack_.size()-1); 531 int length=(int)top; 532 pos=(int)(top>>32); 533 str_.setLength(length&0xffff); 534 length>>>=16; 535 if(length>1) { 536 pos=branchNext(pos, length); 537 if(pos<0) { 538 return entry_; // Reached a final value. 539 } 540 } else { 541 str_.append(chars_.charAt(pos++)); 542 } 543 } 544 if(remainingMatchLength_>=0) { 545 // We only get here if we started in a pending linear-match node 546 // with more than maxLength remaining units. 547 return truncateAndStop(); 548 } 549 for(;;) { 550 int node=chars_.charAt(pos++); 551 if(node>=kMinValueLead) { 552 if(skipValue_) { 553 pos=skipNodeValue(pos, node); 554 node&=kNodeTypeMask; 555 skipValue_=false; 556 } else { 557 // Deliver value for the string so far. 558 boolean isFinal=(node&kValueIsFinal)!=0; 559 if(isFinal) { 560 entry_.value=readValue(chars_, pos, node&0x7fff); 561 } else { 562 entry_.value=readNodeValue(chars_, pos, node); 563 } 564 if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { 565 pos_=-1; 566 } else { 567 // We cannot skip the value right here because it shares its 568 // lead unit with a match node which we have to evaluate 569 // next time. 570 // Instead, keep pos_ on the node lead unit itself. 571 pos_=pos-1; 572 skipValue_=true; 573 } 574 entry_.chars=str_; 575 return entry_; 576 } 577 } 578 if(maxLength_>0 && str_.length()==maxLength_) { 579 return truncateAndStop(); 580 } 581 if(node<kMinLinearMatch) { 582 if(node==0) { 583 node=chars_.charAt(pos++); 584 } 585 pos=branchNext(pos, node+1); 586 if(pos<0) { 587 return entry_; // Reached a final value. 588 } 589 } else { 590 // Linear-match node, append length units to str_. 591 int length=node-kMinLinearMatch+1; 592 if(maxLength_>0 && str_.length()+length>maxLength_) { 593 str_.append(chars_, pos, pos+maxLength_-str_.length()); 594 return truncateAndStop(); 595 } 596 str_.append(chars_, pos, pos+length); 597 pos+=length; 598 } 599 } 600 } 601 602 /** 603 * Iterator.remove() is not supported. 604 * @throws UnsupportedOperationException (always) 605 */ 606 @Override remove()607 public void remove() { 608 throw new UnsupportedOperationException(); 609 } 610 truncateAndStop()611 private Entry truncateAndStop() { 612 pos_=-1; 613 // We reset entry_.chars every time we return entry_ 614 // just because the caller might have modified the Entry. 615 entry_.chars=str_; 616 entry_.value=-1; // no real value for str 617 return entry_; 618 } 619 branchNext(int pos, int length)620 private int branchNext(int pos, int length) { 621 while(length>kMaxBranchLinearSubNodeLength) { 622 ++pos; // ignore the comparison unit 623 // Push state for the greater-or-equal edge. 624 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); 625 // Follow the less-than edge. 626 length>>=1; 627 pos=jumpByDelta(chars_, pos); 628 } 629 // List of key-value pairs where values are either final values or jump deltas. 630 // Read the first (key, value) pair. 631 char trieUnit=chars_.charAt(pos++); 632 int node=chars_.charAt(pos++); 633 boolean isFinal=(node&kValueIsFinal)!=0; 634 int value=readValue(chars_, pos, node&=0x7fff); 635 pos=skipValue(pos, node); 636 stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); 637 str_.append(trieUnit); 638 if(isFinal) { 639 pos_=-1; 640 entry_.chars=str_; 641 entry_.value=value; 642 return -1; 643 } else { 644 return pos+value; 645 } 646 } 647 648 private CharSequence chars_; 649 private int pos_; 650 private int initialPos_; 651 private int remainingMatchLength_; 652 private int initialRemainingMatchLength_; 653 private boolean skipValue_; // Skip intermediate value which was already delivered. 654 655 private StringBuilder str_=new StringBuilder(); 656 private int maxLength_; 657 private Entry entry_=new Entry(); 658 659 // The stack stores longs for backtracking to another 660 // outbound edge of a branch node. 661 // Each long has the offset in chars_ in bits 62..32, 662 // the str_.length() from before the node in bits 15..0, 663 // and the remaining branch length in bits 31..16. 664 // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, 665 // but the code looks more confusing that way.) 666 private ArrayList<Long> stack_=new ArrayList<>(); 667 } 668 stop()669 private void stop() { 670 pos_=-1; 671 } 672 673 // Reads a compact 32-bit integer. 674 // pos is already after the leadUnit, and the lead unit has bit 15 reset. readValue(CharSequence chars, int pos, int leadUnit)675 private static int readValue(CharSequence chars, int pos, int leadUnit) { 676 int value; 677 if(leadUnit<kMinTwoUnitValueLead) { 678 value=leadUnit; 679 } else if(leadUnit<kThreeUnitValueLead) { 680 value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos); 681 } else { 682 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 683 } 684 return value; 685 } skipValue(int pos, int leadUnit)686 private static int skipValue(int pos, int leadUnit) { 687 if(leadUnit>=kMinTwoUnitValueLead) { 688 if(leadUnit<kThreeUnitValueLead) { 689 ++pos; 690 } else { 691 pos+=2; 692 } 693 } 694 return pos; 695 } skipValue(CharSequence chars, int pos)696 private static int skipValue(CharSequence chars, int pos) { 697 int leadUnit=chars.charAt(pos++); 698 return skipValue(pos, leadUnit&0x7fff); 699 } 700 readNodeValue(CharSequence chars, int pos, int leadUnit)701 private static int readNodeValue(CharSequence chars, int pos, int leadUnit) { 702 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 703 int value; 704 if(leadUnit<kMinTwoUnitNodeValueLead) { 705 value=(leadUnit>>6)-1; 706 } else if(leadUnit<kThreeUnitNodeValueLead) { 707 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos); 708 } else { 709 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 710 } 711 return value; 712 } skipNodeValue(int pos, int leadUnit)713 private static int skipNodeValue(int pos, int leadUnit) { 714 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 715 if(leadUnit>=kMinTwoUnitNodeValueLead) { 716 if(leadUnit<kThreeUnitNodeValueLead) { 717 ++pos; 718 } else { 719 pos+=2; 720 } 721 } 722 return pos; 723 } 724 jumpByDelta(CharSequence chars, int pos)725 private static int jumpByDelta(CharSequence chars, int pos) { 726 int delta=chars.charAt(pos++); 727 if(delta>=kMinTwoUnitDeltaLead) { 728 if(delta==kThreeUnitDeltaLead) { 729 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 730 pos+=2; 731 } else { 732 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); 733 } 734 } 735 return pos+delta; 736 } 737 skipDelta(CharSequence chars, int pos)738 private static int skipDelta(CharSequence chars, int pos) { 739 int delta=chars.charAt(pos++); 740 if(delta>=kMinTwoUnitDeltaLead) { 741 if(delta==kThreeUnitDeltaLead) { 742 pos+=2; 743 } else { 744 ++pos; 745 } 746 } 747 return pos; 748 } 749 750 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 751 752 // Handles a branch node for both next(unit) and next(string). branchNext(int pos, int length, int inUnit)753 private Result branchNext(int pos, int length, int inUnit) { 754 // Branch according to the current unit. 755 if(length==0) { 756 length=chars_.charAt(pos++); 757 } 758 ++length; 759 // The length of the branch is the number of units to select from. 760 // The data structure encodes a binary search. 761 while(length>kMaxBranchLinearSubNodeLength) { 762 if(inUnit<chars_.charAt(pos++)) { 763 length>>=1; 764 pos=jumpByDelta(chars_, pos); 765 } else { 766 length=length-(length>>1); 767 pos=skipDelta(chars_, pos); 768 } 769 } 770 // Drop down to linear search for the last few units. 771 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 772 // and divides length by 2. 773 do { 774 if(inUnit==chars_.charAt(pos++)) { 775 Result result; 776 int node=chars_.charAt(pos); 777 if((node&kValueIsFinal)!=0) { 778 // Leave the final value for getValue() to read. 779 result=Result.FINAL_VALUE; 780 } else { 781 // Use the non-final value as the jump delta. 782 ++pos; 783 // int delta=readValue(pos, node); 784 int delta; 785 if(node<kMinTwoUnitValueLead) { 786 delta=node; 787 } else if(node<kThreeUnitValueLead) { 788 delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++); 789 } else { 790 delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1); 791 pos+=2; 792 } 793 // end readValue() 794 pos+=delta; 795 node=chars_.charAt(pos); 796 result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 797 } 798 pos_=pos; 799 return result; 800 } 801 --length; 802 pos=skipValue(chars_, pos); 803 } while(length>1); 804 if(inUnit==chars_.charAt(pos++)) { 805 pos_=pos; 806 int node=chars_.charAt(pos); 807 return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 808 } else { 809 stop(); 810 return Result.NO_MATCH; 811 } 812 } 813 814 // Requires remainingLength_<0. nextImpl(int pos, int inUnit)815 private Result nextImpl(int pos, int inUnit) { 816 int node=chars_.charAt(pos++); 817 for(;;) { 818 if(node<kMinLinearMatch) { 819 return branchNext(pos, node, inUnit); 820 } else if(node<kMinValueLead) { 821 // Match the first of length+1 units. 822 int length=node-kMinLinearMatch; // Actual match length minus 1. 823 if(inUnit==chars_.charAt(pos++)) { 824 remainingMatchLength_=--length; 825 pos_=pos; 826 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 827 valueResults_[node>>15] : Result.NO_VALUE; 828 } else { 829 // No match. 830 break; 831 } 832 } else if((node&kValueIsFinal)!=0) { 833 // No further matching units. 834 break; 835 } else { 836 // Skip intermediate value. 837 pos=skipNodeValue(pos, node); 838 node&=kNodeTypeMask; 839 } 840 } 841 stop(); 842 return Result.NO_MATCH; 843 } 844 845 // Helper functions for getUniqueValue(). 846 // Recursively finds a unique value (or whether there is not a unique one) 847 // from a branch. 848 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 849 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. findUniqueValueFromBranch(CharSequence chars, int pos, int length, long uniqueValue)850 private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, 851 long uniqueValue) { 852 while(length>kMaxBranchLinearSubNodeLength) { 853 ++pos; // ignore the comparison unit 854 uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); 855 if(uniqueValue==0) { 856 return 0; 857 } 858 length=length-(length>>1); 859 pos=skipDelta(chars, pos); 860 } 861 do { 862 ++pos; // ignore a comparison unit 863 // handle its value 864 int node=chars.charAt(pos++); 865 boolean isFinal=(node&kValueIsFinal)!=0; 866 node&=0x7fff; 867 int value=readValue(chars, pos, node); 868 pos=skipValue(pos, node); 869 if(isFinal) { 870 if(uniqueValue!=0) { 871 if(value!=(int)(uniqueValue>>1)) { 872 return 0; 873 } 874 } else { 875 uniqueValue=((long)value<<1)|1; 876 } 877 } else { 878 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); 879 if(uniqueValue==0) { 880 return 0; 881 } 882 } 883 } while(--length>1); 884 // ignore the last comparison byte 885 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 886 } 887 // Recursively finds a unique value (or whether there is not a unique one) 888 // starting from a position on a node lead unit. 889 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 890 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. findUniqueValue(CharSequence chars, int pos, long uniqueValue)891 private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { 892 int node=chars.charAt(pos++); 893 for(;;) { 894 if(node<kMinLinearMatch) { 895 if(node==0) { 896 node=chars.charAt(pos++); 897 } 898 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue); 899 if(uniqueValue==0) { 900 return 0; 901 } 902 pos=(int)(uniqueValue>>>33); 903 node=chars.charAt(pos++); 904 } else if(node<kMinValueLead) { 905 // linear-match node 906 pos+=node-kMinLinearMatch+1; // Ignore the match units. 907 node=chars.charAt(pos++); 908 } else { 909 boolean isFinal=(node&kValueIsFinal)!=0; 910 int value; 911 if(isFinal) { 912 value=readValue(chars, pos, node&0x7fff); 913 } else { 914 value=readNodeValue(chars, pos, node); 915 } 916 if(uniqueValue!=0) { 917 if(value!=(int)(uniqueValue>>1)) { 918 return 0; 919 } 920 } else { 921 uniqueValue=((long)value<<1)|1; 922 } 923 if(isFinal) { 924 return uniqueValue; 925 } 926 pos=skipNodeValue(pos, node); 927 node&=kNodeTypeMask; 928 } 929 } 930 } 931 932 // Helper functions for getNextChars(). 933 // getNextChars() when pos is on a branch node. getNextBranchChars(CharSequence chars, int pos, int length, Appendable out)934 private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) { 935 while(length>kMaxBranchLinearSubNodeLength) { 936 ++pos; // ignore the comparison unit 937 getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out); 938 length=length-(length>>1); 939 pos=skipDelta(chars, pos); 940 } 941 do { 942 append(out, chars.charAt(pos++)); 943 pos=skipValue(chars, pos); 944 } while(--length>1); 945 append(out, chars.charAt(pos)); 946 } append(Appendable out, int c)947 private static void append(Appendable out, int c) { 948 try { 949 out.append((char)c); 950 } catch(IOException e) { 951 throw new ICUUncheckedIOException(e); 952 } 953 } 954 955 // CharsTrie data structure 956 // 957 // The trie consists of a series of char-serialized nodes for incremental 958 // Unicode string/char sequence matching. (char=16-bit unsigned integer) 959 // The root node is at the beginning of the trie data. 960 // 961 // Types of nodes are distinguished by their node lead unit ranges. 962 // After each node, except a final-value node, another node follows to 963 // encode match values or continue matching further units. 964 // 965 // Node types: 966 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 967 // The value is for the string/char sequence so far. 968 // - Match node, optionally with an intermediate value in a different compact format. 969 // The value, if present, is for the string/char sequence so far. 970 // 971 // Aside from the value, which uses the node lead unit's high bits: 972 // 973 // - Linear-match node: Matches a number of units. 974 // - Branch node: Branches to other nodes according to the current input unit. 975 // The node unit is the length of the branch (number of units to select from) 976 // minus 1. It is followed by a sub-node: 977 // - If the length is at most kMaxBranchLinearSubNodeLength, then 978 // there are length-1 (key, value) pairs and then one more comparison unit. 979 // If one of the key units matches, then the value is either a final value for 980 // the string so far, or a "jump" delta to the next node. 981 // If the last unit matches, then matching continues with the next node. 982 // (Values have the same encoding as final-value nodes.) 983 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 984 // there is one unit and one "jump" delta. 985 // If the input unit is less than the sub-node unit, then "jump" by delta to 986 // the next sub-node which will have a length of length/2. 987 // (The delta has its own compact encoding.) 988 // Otherwise, skip the "jump" delta to the next sub-node 989 // which will have a length of length-length/2. 990 991 // Match-node lead unit values, after masking off intermediate-value bits: 992 993 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 994 // the length is one more than the next unit. 995 996 // For a branch sub-node with at most this many entries, we drop down 997 // to a linear search. 998 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 999 1000 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 1001 /*package*/ static final int kMinLinearMatch=0x30; 1002 /*package*/ static final int kMaxLinearMatchLength=0x10; 1003 1004 // Match-node lead unit bits 14..6 for the optional intermediate value. 1005 // If these bits are 0, then there is no intermediate value. 1006 // Otherwise, see the *NodeValue* constants below. 1007 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 1008 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f 1009 1010 // A final-value node has bit 15 set. 1011 /*package*/ static final int kValueIsFinal=0x8000; 1012 1013 // Compact value: After testing and masking off bit 15, use the following thresholds. 1014 /*package*/ static final int kMaxOneUnitValue=0x3fff; 1015 1016 /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 1017 /*package*/ static final int kThreeUnitValueLead=0x7fff; 1018 1019 /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 1020 1021 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 1022 /*package*/ static final int kMaxOneUnitNodeValue=0xff; 1023 /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 1024 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0; 1025 1026 /*package*/ static final int kMaxTwoUnitNodeValue= 1027 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 1028 1029 // Compact delta integers. 1030 /*package*/ static final int kMaxOneUnitDelta=0xfbff; 1031 /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 1032 /*package*/ static final int kThreeUnitDeltaLead=0xffff; 1033 1034 /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 1035 1036 // Fixed value referencing the CharsTrie words. 1037 private CharSequence chars_; 1038 private int root_; 1039 1040 // Iterator variables. 1041 1042 // Pointer to next trie unit to read. NULL if no more matches. 1043 private int pos_; 1044 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1045 private int remainingMatchLength_; 1046 } 1047