1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * created on: 2010nov23 10 * created by: Markus W. Scherer 11 * ported from ICU4C bytestrie.h/.cpp 12 */ 13 package ohos.global.icu.util; 14 15 import java.io.IOException; 16 import java.nio.ByteBuffer; 17 import java.util.ArrayList; 18 import java.util.NoSuchElementException; 19 20 /** 21 * Light-weight, non-const reader class for a BytesTrie. 22 * Traverses a byte-serialized data structure with minimal state, 23 * for mapping byte sequences to non-negative integer values. 24 * 25 * <p>This class is not intended for public subclassing. 26 * 27 * @author Markus W. Scherer 28 * @hide exposed on OHOS 29 */ 30 public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 31 /** 32 * Constructs a BytesTrie reader instance. 33 * 34 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 35 * with the offset indicating the first byte of that sequence. 36 * The BytesTrie object will not read more bytes than 37 * the BytesTrieBuilder generated in the corresponding build() call. 38 * 39 * <p>The array is not copied/cloned and must not be modified while 40 * the BytesTrie object is in use. 41 * 42 * @param trieBytes Bytes array that contains the serialized trie. 43 * @param offset Root offset of the trie in the array. 44 */ BytesTrie(byte[] trieBytes, int offset)45 public BytesTrie(byte[] trieBytes, int offset) { 46 bytes_=trieBytes; 47 pos_=root_=offset; 48 remainingMatchLength_=-1; 49 } 50 51 /** 52 * Copy constructor. 53 * Makes a shallow copy of the other trie reader object and its state. 54 * Does not copy the byte array which will be shared. 55 * Same as clone() but without the throws clause. 56 */ BytesTrie(BytesTrie other)57 public BytesTrie(BytesTrie other) { 58 bytes_ = other.bytes_; 59 root_ = other.root_; 60 pos_ = other.pos_; 61 remainingMatchLength_ = other.remainingMatchLength_; 62 } 63 64 /** 65 * Clones this trie reader object and its state, 66 * but not the byte array which will be shared. 67 * @return A shallow clone of this trie. 68 */ 69 @Override clone()70 public BytesTrie clone() throws CloneNotSupportedException { 71 return (BytesTrie) super.clone(); // A shallow copy is just what we need. 72 } 73 74 /** 75 * Resets this trie to its initial state. 76 * @return this 77 */ reset()78 public BytesTrie reset() { 79 pos_=root_; 80 remainingMatchLength_=-1; 81 return this; 82 } 83 84 /** 85 * Returns the state of this trie as a 64-bit integer. 86 * The state value is never 0. 87 * 88 * @return opaque state value 89 * @see #resetToState64 90 */ getState64()91 public long getState64() { 92 return ((long)remainingMatchLength_ << 32) | pos_; 93 } 94 95 /** 96 * Resets this trie to the saved state. 97 * Unlike {@link #resetToState(State)}, the 64-bit state value 98 * must be from {@link #getState64()} from the same trie object or 99 * from one initialized the exact same way. 100 * Because of no validation, this method is faster. 101 * 102 * @param state The opaque trie state value from getState64(). 103 * @return this 104 * @see #getState64 105 * @see #resetToState 106 * @see #reset 107 */ resetToState64(long state)108 public BytesTrie resetToState64(long state) { 109 remainingMatchLength_ = (int)(state >> 32); 110 pos_ = (int)state; 111 return this; 112 } 113 114 /** 115 * BytesTrie state object, for saving a trie's current state 116 * and resetting the trie back to this state later. 117 * @hide exposed on OHOS 118 */ 119 public static final class State { 120 /** 121 * Constructs an empty State. 122 */ State()123 public State() {} 124 private byte[] bytes; 125 private int root; 126 private int pos; 127 private int remainingMatchLength; 128 } 129 130 /** 131 * Saves the state of this trie. 132 * @param state The State object to hold the trie's state. 133 * @return this 134 * @see #resetToState 135 */ saveState(State state)136 public BytesTrie saveState(State state) /*const*/ { 137 state.bytes=bytes_; 138 state.root=root_; 139 state.pos=pos_; 140 state.remainingMatchLength=remainingMatchLength_; 141 return this; 142 } 143 144 /** 145 * Resets this trie to the saved state. 146 * Slower than {@link #resetToState64(long)} which does not validate the state value. 147 * 148 * @param state The State object which holds a saved trie state. 149 * @return this 150 * @throws IllegalArgumentException if the state object contains no state, 151 * or the state of a different trie 152 * @see #saveState 153 * @see #reset 154 */ resetToState(State state)155 public BytesTrie resetToState(State state) { 156 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 157 pos_=state.pos; 158 remainingMatchLength_=state.remainingMatchLength; 159 } else { 160 throw new IllegalArgumentException("incompatible trie state"); 161 } 162 return this; 163 } 164 165 /** 166 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 167 * @hide exposed on OHOS 168 */ 169 public enum Result { 170 /** 171 * The input unit(s) did not continue a matching string. 172 * Once current()/next() return NO_MATCH, 173 * all further calls to current()/next() will also return NO_MATCH, 174 * until the trie is reset to its original state or to a saved state. 175 */ 176 NO_MATCH, 177 /** 178 * The input unit(s) continued a matching string 179 * but there is no value for the string so far. 180 * (It is a prefix of a longer string.) 181 */ 182 NO_VALUE, 183 /** 184 * The input unit(s) continued a matching string 185 * and there is a value for the string so far. 186 * This value will be returned by getValue(). 187 * No further input byte/unit can continue a matching string. 188 */ 189 FINAL_VALUE, 190 /** 191 * The input unit(s) continued a matching string 192 * and there is a value for the string so far. 193 * This value will be returned by getValue(). 194 * Another input byte/unit can continue a matching string. 195 */ 196 INTERMEDIATE_VALUE; 197 198 // Note: The following methods assume the particular order 199 // of enum constants, treating the ordinal() values like bit sets. 200 // Do not reorder the enum constants! 201 202 /** 203 * Same as (result!=NO_MATCH). 204 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 205 */ matches()206 public boolean matches() { return this!=NO_MATCH; } 207 208 /** 209 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 210 * @return true if there is a value for the input bytes/units so far. 211 * @see #getValue 212 */ hasValue()213 public boolean hasValue() { return ordinal()>=2; } 214 215 /** 216 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 217 * @return true if another input byte/unit can continue a matching string. 218 */ hasNext()219 public boolean hasNext() { return (ordinal()&1)!=0; } 220 } 221 222 /** 223 * Determines whether the byte sequence so far matches, whether it has a value, 224 * and whether another input byte can continue a matching byte sequence. 225 * @return The match/value Result. 226 */ current()227 public Result current() /*const*/ { 228 int pos=pos_; 229 if(pos<0) { 230 return Result.NO_MATCH; 231 } else { 232 int node; 233 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 234 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 235 } 236 } 237 238 /** 239 * Traverses the trie from the initial state for this input byte. 240 * Equivalent to reset().next(inByte). 241 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 242 * Values below -0x100 and above 0xff will never match. 243 * @return The match/value Result. 244 */ first(int inByte)245 public Result first(int inByte) { 246 remainingMatchLength_=-1; 247 if(inByte<0) { 248 inByte+=0x100; 249 } 250 return nextImpl(root_, inByte); 251 } 252 253 /** 254 * Traverses the trie from the current state for this input byte. 255 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 256 * Values below -0x100 and above 0xff will never match. 257 * @return The match/value Result. 258 */ next(int inByte)259 public Result next(int inByte) { 260 int pos=pos_; 261 if(pos<0) { 262 return Result.NO_MATCH; 263 } 264 if(inByte<0) { 265 inByte+=0x100; 266 } 267 int length=remainingMatchLength_; // Actual remaining match length minus 1. 268 if(length>=0) { 269 // Remaining part of a linear-match node. 270 if(inByte==(bytes_[pos++]&0xff)) { 271 remainingMatchLength_=--length; 272 pos_=pos; 273 int node; 274 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 275 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 276 } else { 277 stop(); 278 return Result.NO_MATCH; 279 } 280 } 281 return nextImpl(pos, inByte); 282 } 283 284 /** 285 * Traverses the trie from the current state for this byte sequence. 286 * Equivalent to 287 * <pre> 288 * Result result=current(); 289 * for(each c in s) 290 * if(!result.hasNext()) return Result.NO_MATCH; 291 * result=next(c); 292 * return result; 293 * </pre> 294 * @param s Contains a string or byte sequence. 295 * @param sIndex The start index of the byte sequence in s. 296 * @param sLimit The (exclusive) end index of the byte sequence in s. 297 * @return The match/value Result. 298 */ next(byte[] s, int sIndex, int sLimit)299 public Result next(byte[] s, int sIndex, int sLimit) { 300 if(sIndex>=sLimit) { 301 // Empty input. 302 return current(); 303 } 304 int pos=pos_; 305 if(pos<0) { 306 return Result.NO_MATCH; 307 } 308 int length=remainingMatchLength_; // Actual remaining match length minus 1. 309 for(;;) { 310 // Fetch the next input byte, if there is one. 311 // Continue a linear-match node. 312 byte inByte; 313 for(;;) { 314 if(sIndex==sLimit) { 315 remainingMatchLength_=length; 316 pos_=pos; 317 int node; 318 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 319 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 320 } 321 inByte=s[sIndex++]; 322 if(length<0) { 323 remainingMatchLength_=length; 324 break; 325 } 326 if(inByte!=bytes_[pos]) { 327 stop(); 328 return Result.NO_MATCH; 329 } 330 ++pos; 331 --length; 332 } 333 for(;;) { 334 int node=bytes_[pos++]&0xff; 335 if(node<kMinLinearMatch) { 336 Result result=branchNext(pos, node, inByte&0xff); 337 if(result==Result.NO_MATCH) { 338 return Result.NO_MATCH; 339 } 340 // Fetch the next input byte, if there is one. 341 if(sIndex==sLimit) { 342 return result; 343 } 344 if(result==Result.FINAL_VALUE) { 345 // No further matching bytes. 346 stop(); 347 return Result.NO_MATCH; 348 } 349 inByte=s[sIndex++]; 350 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 351 } else if(node<kMinValueLead) { 352 // Match length+1 bytes. 353 length=node-kMinLinearMatch; // Actual match length minus 1. 354 if(inByte!=bytes_[pos]) { 355 stop(); 356 return Result.NO_MATCH; 357 } 358 ++pos; 359 --length; 360 break; 361 } else if((node&kValueIsFinal)!=0) { 362 // No further matching bytes. 363 stop(); 364 return Result.NO_MATCH; 365 } else { 366 // Skip intermediate value. 367 pos=skipValue(pos, node); 368 // The next node must not also be a value node. 369 assert((bytes_[pos]&0xff)<kMinValueLead); 370 } 371 } 372 } 373 } 374 375 /** 376 * Returns a matching byte sequence's value if called immediately after 377 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 378 * getValue() can be called multiple times. 379 * 380 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 381 * @return The value for the byte sequence so far. 382 */ 383 public int getValue() /*const*/ { 384 int pos=pos_; 385 int leadByte=bytes_[pos++]&0xff; 386 assert(leadByte>=kMinValueLead); 387 return readValue(bytes_, pos, leadByte>>1); 388 } 389 390 /** 391 * Determines whether all byte sequences reachable from the current state 392 * map to the same value, and if so, returns that value. 393 * @return The unique value in bits 32..1 with bit 0 set, 394 * if all byte sequences reachable from the current state 395 * map to the same value; otherwise returns 0. 396 */ getUniqueValue()397 public long getUniqueValue() /*const*/ { 398 int pos=pos_; 399 if(pos<0) { 400 return 0; 401 } 402 // Skip the rest of a pending linear-match node. 403 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 404 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 405 return (uniqueValue<<31)>>31; 406 } 407 408 /** 409 * Finds each byte which continues the byte sequence from the current state. 410 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 411 * @param out Each next byte is 0-extended to a char and appended to this object. 412 * (Only uses the out.append(c) method.) 413 * @return The number of bytes which continue the byte sequence from here. 414 */ getNextBytes(Appendable out)415 public int getNextBytes(Appendable out) /*const*/ { 416 int pos=pos_; 417 if(pos<0) { 418 return 0; 419 } 420 if(remainingMatchLength_>=0) { 421 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 422 return 1; 423 } 424 int node=bytes_[pos++]&0xff; 425 if(node>=kMinValueLead) { 426 if((node&kValueIsFinal)!=0) { 427 return 0; 428 } else { 429 pos=skipValue(pos, node); 430 node=bytes_[pos++]&0xff; 431 assert(node<kMinValueLead); 432 } 433 } 434 if(node<kMinLinearMatch) { 435 if(node==0) { 436 node=bytes_[pos++]&0xff; 437 } 438 getNextBranchBytes(bytes_, pos, ++node, out); 439 return node; 440 } else { 441 // First byte of the linear-match node. 442 append(out, bytes_[pos]&0xff); 443 return 1; 444 } 445 } 446 447 /** 448 * Iterates from the current state of this trie. 449 * @return A new BytesTrie.Iterator. 450 */ 451 @Override 452 public Iterator iterator() { 453 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 454 } 455 456 /** 457 * Iterates from the current state of this trie. 458 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 459 * Otherwise, the iterator returns strings with this maximum length. 460 * @return A new BytesTrie.Iterator. 461 */ 462 public Iterator iterator(int maxStringLength) { 463 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 464 } 465 466 /** 467 * Iterates from the root of a byte-serialized BytesTrie. 468 * @param trieBytes Bytes array that contains the serialized trie. 469 * @param offset Root offset of the trie in the array. 470 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 471 * Otherwise, the iterator returns strings with this maximum length. 472 * @return A new BytesTrie.Iterator. 473 */ 474 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 475 return new Iterator(trieBytes, offset, -1, maxStringLength); 476 } 477 478 /** 479 * Return value type for the Iterator. 480 * @hide exposed on OHOS 481 */ 482 public static final class Entry { 483 private Entry(int capacity) { 484 bytes=new byte[capacity]; 485 } 486 487 /** 488 * @return The length of the byte sequence. 489 */ 490 public int bytesLength() { return length; } 491 /** 492 * Returns a byte of the byte sequence. 493 * @param index An index into the byte sequence. 494 * @return The index-th byte sequence byte. 495 */ 496 public byte byteAt(int index) { return bytes[index]; } 497 /** 498 * Copies the byte sequence into a byte array. 499 * @param dest Destination byte array. 500 * @param destOffset Starting offset to where in dest the byte sequence is copied. 501 */ 502 public void copyBytesTo(byte[] dest, int destOffset) { 503 System.arraycopy(bytes, 0, dest, destOffset, length); 504 } 505 /** 506 * @return The byte sequence as a read-only ByteBuffer. 507 */ 508 public ByteBuffer bytesAsByteBuffer() { 509 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 510 } 511 512 /** 513 * The value associated with the byte sequence. 514 */ 515 public int value; 516 517 private void ensureCapacity(int len) { 518 if(bytes.length<len) { 519 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 520 System.arraycopy(bytes, 0, newBytes, 0, length); 521 bytes=newBytes; 522 } 523 } 524 private void append(byte b) { 525 ensureCapacity(length+1); 526 bytes[length++]=b; 527 } 528 private void append(byte[] b, int off, int len) { 529 ensureCapacity(length+len); 530 System.arraycopy(b, off, bytes, length, len); 531 length+=len; 532 } 533 private void truncateString(int newLength) { length=newLength; } 534 535 private byte[] bytes; 536 private int length; 537 } 538 539 /** 540 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 541 * @hide exposed on OHOS 542 */ 543 public static final class Iterator implements java.util.Iterator<Entry> { 544 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 545 bytes_=trieBytes; 546 pos_=initialPos_=offset; 547 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 548 maxLength_=maxStringLength; 549 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 550 int length=remainingMatchLength_; // Actual remaining match length minus 1. 551 if(length>=0) { 552 // Pending linear-match node, append remaining bytes to entry_. 553 ++length; 554 if(maxLength_>0 && length>maxLength_) { 555 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 556 } 557 entry_.append(bytes_, pos_, length); 558 pos_+=length; 559 remainingMatchLength_-=length; 560 } 561 } 562 563 /** 564 * Resets this iterator to its initial state. 565 * @return this 566 */ reset()567 public Iterator reset() { 568 pos_=initialPos_; 569 remainingMatchLength_=initialRemainingMatchLength_; 570 int length=remainingMatchLength_+1; // Remaining match length. 571 if(maxLength_>0 && length>maxLength_) { 572 length=maxLength_; 573 } 574 entry_.truncateString(length); 575 pos_+=length; 576 remainingMatchLength_-=length; 577 stack_.clear(); 578 return this; 579 } 580 581 /** 582 * @return true if there are more elements. 583 */ 584 @Override hasNext()585 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 586 587 /** 588 * Finds the next (byte sequence, value) pair if there is one. 589 * 590 * If the byte sequence is truncated to the maximum length and does not 591 * have a real value, then the value is set to -1. 592 * In this case, this "not a real value" is indistinguishable from 593 * a real value of -1. 594 * @return An Entry with the string and value of the next element. 595 * @throws NoSuchElementException - iteration has no more elements. 596 */ 597 @Override next()598 public Entry next() { 599 int pos=pos_; 600 if(pos<0) { 601 if(stack_.isEmpty()) { 602 throw new NoSuchElementException(); 603 } 604 // Pop the state off the stack and continue with the next outbound edge of 605 // the branch node. 606 long top=stack_.remove(stack_.size()-1); 607 int length=(int)top; 608 pos=(int)(top>>32); 609 entry_.truncateString(length&0xffff); 610 length>>>=16; 611 if(length>1) { 612 pos=branchNext(pos, length); 613 if(pos<0) { 614 return entry_; // Reached a final value. 615 } 616 } else { 617 entry_.append(bytes_[pos++]); 618 } 619 } 620 if(remainingMatchLength_>=0) { 621 // We only get here if we started in a pending linear-match node 622 // with more than maxLength remaining bytes. 623 return truncateAndStop(); 624 } 625 for(;;) { 626 int node=bytes_[pos++]&0xff; 627 if(node>=kMinValueLead) { 628 // Deliver value for the byte sequence so far. 629 boolean isFinal=(node&kValueIsFinal)!=0; 630 entry_.value=readValue(bytes_, pos, node>>1); 631 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 632 pos_=-1; 633 } else { 634 pos_=skipValue(pos, node); 635 } 636 return entry_; 637 } 638 if(maxLength_>0 && entry_.length==maxLength_) { 639 return truncateAndStop(); 640 } 641 if(node<kMinLinearMatch) { 642 if(node==0) { 643 node=bytes_[pos++]&0xff; 644 } 645 pos=branchNext(pos, node+1); 646 if(pos<0) { 647 return entry_; // Reached a final value. 648 } 649 } else { 650 // Linear-match node, append length bytes to entry_. 651 int length=node-kMinLinearMatch+1; 652 if(maxLength_>0 && entry_.length+length>maxLength_) { 653 entry_.append(bytes_, pos, maxLength_-entry_.length); 654 return truncateAndStop(); 655 } 656 entry_.append(bytes_, pos, length); 657 pos+=length; 658 } 659 } 660 } 661 662 /** 663 * Iterator.remove() is not supported. 664 * @throws UnsupportedOperationException (always) 665 */ 666 @Override remove()667 public void remove() { 668 throw new UnsupportedOperationException(); 669 } 670 truncateAndStop()671 private Entry truncateAndStop() { 672 pos_=-1; 673 entry_.value=-1; // no real value for str 674 return entry_; 675 } 676 branchNext(int pos, int length)677 private int branchNext(int pos, int length) { 678 while(length>kMaxBranchLinearSubNodeLength) { 679 ++pos; // ignore the comparison byte 680 // Push state for the greater-or-equal edge. 681 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 682 // Follow the less-than edge. 683 length>>=1; 684 pos=jumpByDelta(bytes_, pos); 685 } 686 // List of key-value pairs where values are either final values or jump deltas. 687 // Read the first (key, value) pair. 688 byte trieByte=bytes_[pos++]; 689 int node=bytes_[pos++]&0xff; 690 boolean isFinal=(node&kValueIsFinal)!=0; 691 int value=readValue(bytes_, pos, node>>1); 692 pos=skipValue(pos, node); 693 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 694 entry_.append(trieByte); 695 if(isFinal) { 696 pos_=-1; 697 entry_.value=value; 698 return -1; 699 } else { 700 return pos+value; 701 } 702 } 703 704 private byte[] bytes_; 705 private int pos_; 706 private int initialPos_; 707 private int remainingMatchLength_; 708 private int initialRemainingMatchLength_; 709 710 private int maxLength_; 711 private Entry entry_; 712 713 // The stack stores longs for backtracking to another 714 // outbound edge of a branch node. 715 // Each long has the offset from bytes_ in bits 62..32, 716 // the entry_.stringLength() from before the node in bits 15..0, 717 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 718 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 719 // but the code looks more confusing that way.) 720 private ArrayList<Long> stack_=new ArrayList<>(); 721 } 722 stop()723 private void stop() { 724 pos_=-1; 725 } 726 727 // Reads a compact 32-bit integer. 728 // pos is already after the leadByte, and the lead byte is already shifted right by 1. readValue(byte[] bytes, int pos, int leadByte)729 private static int readValue(byte[] bytes, int pos, int leadByte) { 730 int value; 731 if(leadByte<kMinTwoByteValueLead) { 732 value=leadByte-kMinOneByteValueLead; 733 } else if(leadByte<kMinThreeByteValueLead) { 734 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 735 } else if(leadByte<kFourByteValueLead) { 736 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 737 } else if(leadByte==kFourByteValueLead) { 738 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 739 } else { 740 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 741 } 742 return value; 743 } skipValue(int pos, int leadByte)744 private static int skipValue(int pos, int leadByte) { 745 assert(leadByte>=kMinValueLead); 746 if(leadByte>=(kMinTwoByteValueLead<<1)) { 747 if(leadByte<(kMinThreeByteValueLead<<1)) { 748 ++pos; 749 } else if(leadByte<(kFourByteValueLead<<1)) { 750 pos+=2; 751 } else { 752 pos+=3+((leadByte>>1)&1); 753 } 754 } 755 return pos; 756 } skipValue(byte[] bytes, int pos)757 private static int skipValue(byte[] bytes, int pos) { 758 int leadByte=bytes[pos++]&0xff; 759 return skipValue(pos, leadByte); 760 } 761 762 // Reads a jump delta and jumps. jumpByDelta(byte[] bytes, int pos)763 private static int jumpByDelta(byte[] bytes, int pos) { 764 int delta=bytes[pos++]&0xff; 765 if(delta<kMinTwoByteDeltaLead) { 766 // nothing to do 767 } else if(delta<kMinThreeByteDeltaLead) { 768 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 769 } else if(delta<kFourByteDeltaLead) { 770 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 771 pos+=2; 772 } else if(delta==kFourByteDeltaLead) { 773 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 774 pos+=3; 775 } else { 776 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 777 pos+=4; 778 } 779 return pos+delta; 780 } 781 skipDelta(byte[] bytes, int pos)782 private static int skipDelta(byte[] bytes, int pos) { 783 int delta=bytes[pos++]&0xff; 784 if(delta>=kMinTwoByteDeltaLead) { 785 if(delta<kMinThreeByteDeltaLead) { 786 ++pos; 787 } else if(delta<kFourByteDeltaLead) { 788 pos+=2; 789 } else { 790 pos+=3+(delta&1); 791 } 792 } 793 return pos; 794 } 795 796 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 797 798 // Handles a branch node for both next(byte) and next(string). branchNext(int pos, int length, int inByte)799 private Result branchNext(int pos, int length, int inByte) { 800 // Branch according to the current byte. 801 if(length==0) { 802 length=bytes_[pos++]&0xff; 803 } 804 ++length; 805 // The length of the branch is the number of bytes to select from. 806 // The data structure encodes a binary search. 807 while(length>kMaxBranchLinearSubNodeLength) { 808 if(inByte<(bytes_[pos++]&0xff)) { 809 length>>=1; 810 pos=jumpByDelta(bytes_, pos); 811 } else { 812 length=length-(length>>1); 813 pos=skipDelta(bytes_, pos); 814 } 815 } 816 // Drop down to linear search for the last few bytes. 817 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 818 // and divides length by 2. 819 do { 820 if(inByte==(bytes_[pos++]&0xff)) { 821 Result result; 822 int node=bytes_[pos]&0xff; 823 assert(node>=kMinValueLead); 824 if((node&kValueIsFinal)!=0) { 825 // Leave the final value for getValue() to read. 826 result=Result.FINAL_VALUE; 827 } else { 828 // Use the non-final value as the jump delta. 829 ++pos; 830 // int delta=readValue(pos, node>>1); 831 node>>=1; 832 int delta; 833 if(node<kMinTwoByteValueLead) { 834 delta=node-kMinOneByteValueLead; 835 } else if(node<kMinThreeByteValueLead) { 836 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 837 } else if(node<kFourByteValueLead) { 838 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 839 pos+=2; 840 } else if(node==kFourByteValueLead) { 841 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 842 pos+=3; 843 } else { 844 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 845 pos+=4; 846 } 847 // end readValue() 848 pos+=delta; 849 node=bytes_[pos]&0xff; 850 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 851 } 852 pos_=pos; 853 return result; 854 } 855 --length; 856 pos=skipValue(bytes_, pos); 857 } while(length>1); 858 if(inByte==(bytes_[pos++]&0xff)) { 859 pos_=pos; 860 int node=bytes_[pos]&0xff; 861 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 862 } else { 863 stop(); 864 return Result.NO_MATCH; 865 } 866 } 867 868 // Requires remainingLength_<0. nextImpl(int pos, int inByte)869 private Result nextImpl(int pos, int inByte) { 870 for(;;) { 871 int node=bytes_[pos++]&0xff; 872 if(node<kMinLinearMatch) { 873 return branchNext(pos, node, inByte); 874 } else if(node<kMinValueLead) { 875 // Match the first of length+1 bytes. 876 int length=node-kMinLinearMatch; // Actual match length minus 1. 877 if(inByte==(bytes_[pos++]&0xff)) { 878 remainingMatchLength_=--length; 879 pos_=pos; 880 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 881 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 882 } else { 883 // No match. 884 break; 885 } 886 } else if((node&kValueIsFinal)!=0) { 887 // No further matching bytes. 888 break; 889 } else { 890 // Skip intermediate value. 891 pos=skipValue(pos, node); 892 // The next node must not also be a value node. 893 assert((bytes_[pos]&0xff)<kMinValueLead); 894 } 895 } 896 stop(); 897 return Result.NO_MATCH; 898 } 899 900 // Helper functions for getUniqueValue(). 901 // Recursively finds a unique value (or whether there is not a unique one) 902 // from a branch. 903 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 904 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 905 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 906 long uniqueValue) { 907 while(length>kMaxBranchLinearSubNodeLength) { 908 ++pos; // ignore the comparison byte 909 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 910 if(uniqueValue==0) { 911 return 0; 912 } 913 length=length-(length>>1); 914 pos=skipDelta(bytes, pos); 915 } 916 do { 917 ++pos; // ignore a comparison byte 918 // handle its value 919 int node=bytes[pos++]&0xff; 920 boolean isFinal=(node&kValueIsFinal)!=0; 921 int value=readValue(bytes, pos, node>>1); 922 pos=skipValue(pos, node); 923 if(isFinal) { 924 if(uniqueValue!=0) { 925 if(value!=(int)(uniqueValue>>1)) { 926 return 0; 927 } 928 } else { 929 uniqueValue=((long)value<<1)|1; 930 } 931 } else { 932 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 933 if(uniqueValue==0) { 934 return 0; 935 } 936 } 937 } while(--length>1); 938 // ignore the last comparison byte 939 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 940 } 941 // Recursively finds a unique value (or whether there is not a unique one) 942 // starting from a position on a node lead byte. 943 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 944 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. findUniqueValue(byte[] bytes, int pos, long uniqueValue)945 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 946 for(;;) { 947 int node=bytes[pos++]&0xff; 948 if(node<kMinLinearMatch) { 949 if(node==0) { 950 node=bytes[pos++]&0xff; 951 } 952 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 953 if(uniqueValue==0) { 954 return 0; 955 } 956 pos=(int)(uniqueValue>>>33); 957 } else if(node<kMinValueLead) { 958 // linear-match node 959 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 960 } else { 961 boolean isFinal=(node&kValueIsFinal)!=0; 962 int value=readValue(bytes, pos, node>>1); 963 if(uniqueValue!=0) { 964 if(value!=(int)(uniqueValue>>1)) { 965 return 0; 966 } 967 } else { 968 uniqueValue=((long)value<<1)|1; 969 } 970 if(isFinal) { 971 return uniqueValue; 972 } 973 pos=skipValue(pos, node); 974 } 975 } 976 } 977 978 // Helper functions for getNextBytes(). 979 // getNextBytes() when pos is on a branch node. getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out)980 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 981 while(length>kMaxBranchLinearSubNodeLength) { 982 ++pos; // ignore the comparison byte 983 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 984 length=length-(length>>1); 985 pos=skipDelta(bytes, pos); 986 } 987 do { 988 append(out, bytes[pos++]&0xff); 989 pos=skipValue(bytes, pos); 990 } while(--length>1); 991 append(out, bytes[pos]&0xff); 992 } append(Appendable out, int c)993 private static void append(Appendable out, int c) { 994 try { 995 out.append((char)c); 996 } catch(IOException e) { 997 throw new ICUUncheckedIOException(e); 998 } 999 } 1000 1001 // BytesTrie data structure 1002 // 1003 // The trie consists of a series of byte-serialized nodes for incremental 1004 // string/byte sequence matching. The root node is at the beginning of the trie data. 1005 // 1006 // Types of nodes are distinguished by their node lead byte ranges. 1007 // After each node, except a final-value node, another node follows to 1008 // encode match values or continue matching further bytes. 1009 // 1010 // Node types: 1011 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 1012 // The value is for the string/byte sequence so far. 1013 // One node bit indicates whether the value is final or whether 1014 // matching continues with the next node. 1015 // - Linear-match node: Matches a number of bytes. 1016 // - Branch node: Branches to other nodes according to the current input byte. 1017 // The node byte is the length of the branch (number of bytes to select from) 1018 // minus 1. It is followed by a sub-node: 1019 // - If the length is at most kMaxBranchLinearSubNodeLength, then 1020 // there are length-1 (key, value) pairs and then one more comparison byte. 1021 // If one of the key bytes matches, then the value is either a final value for 1022 // the string/byte sequence so far, or a "jump" delta to the next node. 1023 // If the last byte matches, then matching continues with the next node. 1024 // (Values have the same encoding as value nodes.) 1025 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1026 // there is one byte and one "jump" delta. 1027 // If the input byte is less than the sub-node byte, then "jump" by delta to 1028 // the next sub-node which will have a length of length/2. 1029 // (The delta has its own compact encoding.) 1030 // Otherwise, skip the "jump" delta to the next sub-node 1031 // which will have a length of length-length/2. 1032 1033 // Node lead byte values. 1034 1035 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1036 // the length is one more than the next byte. 1037 1038 // For a branch sub-node with at most this many entries, we drop down 1039 // to a linear search. 1040 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 1041 1042 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1043 /*package*/ static final int kMinLinearMatch=0x10; 1044 /*package*/ static final int kMaxLinearMatchLength=0x10; 1045 1046 // 20..ff: Variable-length value node. 1047 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1048 // Then shift-right by 1 bit. 1049 // The remaining lead byte value indicates the number of following bytes (0..4) 1050 // and contains the value's top bits. 1051 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1052 // It is a final value if bit 0 is set. 1053 private static final int kValueIsFinal=1; 1054 1055 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1056 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1057 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1058 1059 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1060 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1061 1062 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1063 /*package*/ static final int kFourByteValueLead=0x7e; 1064 1065 // A little more than Unicode code points. (0x11ffff) 1066 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1067 1068 /*package*/ static final int kFiveByteValueLead=0x7f; 1069 1070 // Compact delta integers. 1071 /*package*/ static final int kMaxOneByteDelta=0xbf; 1072 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1073 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1074 /*package*/ static final int kFourByteDeltaLead=0xfe; 1075 /*package*/ static final int kFiveByteDeltaLead=0xff; 1076 1077 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1078 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1079 1080 // Fixed value referencing the BytesTrie bytes. 1081 private byte[] bytes_; 1082 private int root_; 1083 1084 // Iterator variables. 1085 1086 // Index of next trie byte to read. Negative if no more matches. 1087 private int pos_; 1088 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1089 private int remainingMatchLength_; 1090 }; 1091