1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * created on: 2010nov23 9 * created by: Markus W. Scherer 10 * ported from ICU4C bytestrie.h/.cpp 11 */ 12 package com.ibm.icu.util; 13 14 import java.io.IOException; 15 import java.nio.ByteBuffer; 16 import java.util.ArrayList; 17 import java.util.NoSuchElementException; 18 19 /** 20 * Light-weight, non-const reader class for a BytesTrie. 21 * Traverses a byte-serialized data structure with minimal state, 22 * for mapping byte sequences to non-negative integer values. 23 * 24 * <p>This class is not intended for public subclassing. 25 * 26 * @stable ICU 4.8 27 * @author Markus W. Scherer 28 */ 29 public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 30 /** 31 * Constructs a BytesTrie reader instance. 32 * 33 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 34 * with the offset indicating the first byte of that sequence. 35 * The BytesTrie object will not read more bytes than 36 * the BytesTrieBuilder generated in the corresponding build() call. 37 * 38 * <p>The array is not copied/cloned and must not be modified while 39 * the BytesTrie object is in use. 40 * 41 * @param trieBytes Bytes array that contains the serialized trie. 42 * @param offset Root offset of the trie in the array. 43 * @stable ICU 4.8 44 */ BytesTrie(byte[] trieBytes, int offset)45 public BytesTrie(byte[] trieBytes, int offset) { 46 bytes_=trieBytes; 47 pos_=root_=offset; 48 remainingMatchLength_=-1; 49 } 50 51 /** 52 * Copy constructor. 53 * Makes a shallow copy of the other trie reader object and its state. 54 * Does not copy the byte array which will be shared. 55 * Same as clone() but without the throws clause. 56 * 57 * @stable ICU 64 58 */ BytesTrie(BytesTrie other)59 public BytesTrie(BytesTrie other) { 60 bytes_ = other.bytes_; 61 root_ = other.root_; 62 pos_ = other.pos_; 63 remainingMatchLength_ = other.remainingMatchLength_; 64 } 65 66 /** 67 * Clones this trie reader object and its state, 68 * but not the byte array which will be shared. 69 * @return A shallow clone of this trie. 70 * @stable ICU 4.8 71 */ 72 @Override clone()73 public BytesTrie clone() throws CloneNotSupportedException { 74 return (BytesTrie) super.clone(); // A shallow copy is just what we need. 75 } 76 77 /** 78 * Resets this trie to its initial state. 79 * @return this 80 * @stable ICU 4.8 81 */ reset()82 public BytesTrie reset() { 83 pos_=root_; 84 remainingMatchLength_=-1; 85 return this; 86 } 87 88 /** 89 * Returns the state of this trie as a 64-bit integer. 90 * The state value is never 0. 91 * 92 * @return opaque state value 93 * @see #resetToState64 94 * @stable ICU 64 95 */ getState64()96 public long getState64() { 97 return ((long)remainingMatchLength_ << 32) | pos_; 98 } 99 100 /** 101 * Resets this trie to the saved state. 102 * Unlike {@link #resetToState(State)}, the 64-bit state value 103 * must be from {@link #getState64()} from the same trie object or 104 * from one initialized the exact same way. 105 * Because of no validation, this method is faster. 106 * 107 * @param state The opaque trie state value from getState64(). 108 * @return this 109 * @see #getState64 110 * @see #resetToState 111 * @see #reset 112 * @stable ICU 64 113 */ resetToState64(long state)114 public BytesTrie resetToState64(long state) { 115 remainingMatchLength_ = (int)(state >> 32); 116 pos_ = (int)state; 117 return this; 118 } 119 120 /** 121 * BytesTrie state object, for saving a trie's current state 122 * and resetting the trie back to this state later. 123 * @stable ICU 4.8 124 */ 125 public static final class State { 126 /** 127 * Constructs an empty State. 128 * @stable ICU 4.8 129 */ State()130 public State() {} 131 private byte[] bytes; 132 private int root; 133 private int pos; 134 private int remainingMatchLength; 135 } 136 137 /** 138 * Saves the state of this trie. 139 * @param state The State object to hold the trie's state. 140 * @return this 141 * @see #resetToState 142 * @stable ICU 4.8 143 */ saveState(State state)144 public BytesTrie saveState(State state) /*const*/ { 145 state.bytes=bytes_; 146 state.root=root_; 147 state.pos=pos_; 148 state.remainingMatchLength=remainingMatchLength_; 149 return this; 150 } 151 152 /** 153 * Resets this trie to the saved state. 154 * Slower than {@link #resetToState64(long)} which does not validate the state value. 155 * 156 * @param state The State object which holds a saved trie state. 157 * @return this 158 * @throws IllegalArgumentException if the state object contains no state, 159 * or the state of a different trie 160 * @see #saveState 161 * @see #reset 162 * @stable ICU 4.8 163 */ resetToState(State state)164 public BytesTrie resetToState(State state) { 165 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 166 pos_=state.pos; 167 remainingMatchLength_=state.remainingMatchLength; 168 } else { 169 throw new IllegalArgumentException("incompatible trie state"); 170 } 171 return this; 172 } 173 174 /** 175 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 176 * @stable ICU 4.8 177 */ 178 public enum Result { 179 /** 180 * The input unit(s) did not continue a matching string. 181 * Once current()/next() return NO_MATCH, 182 * all further calls to current()/next() will also return NO_MATCH, 183 * until the trie is reset to its original state or to a saved state. 184 * @stable ICU 4.8 185 */ 186 NO_MATCH, 187 /** 188 * The input unit(s) continued a matching string 189 * but there is no value for the string so far. 190 * (It is a prefix of a longer string.) 191 * @stable ICU 4.8 192 */ 193 NO_VALUE, 194 /** 195 * The input unit(s) continued a matching string 196 * and there is a value for the string so far. 197 * This value will be returned by getValue(). 198 * No further input byte/unit can continue a matching string. 199 * @stable ICU 4.8 200 */ 201 FINAL_VALUE, 202 /** 203 * The input unit(s) continued a matching string 204 * and there is a value for the string so far. 205 * This value will be returned by getValue(). 206 * Another input byte/unit can continue a matching string. 207 * @stable ICU 4.8 208 */ 209 INTERMEDIATE_VALUE; 210 211 // Note: The following methods assume the particular order 212 // of enum constants, treating the ordinal() values like bit sets. 213 // Do not reorder the enum constants! 214 215 /** 216 * Same as (result!=NO_MATCH). 217 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 218 * @stable ICU 4.8 219 */ matches()220 public boolean matches() { return this!=NO_MATCH; } 221 222 /** 223 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 224 * @return true if there is a value for the input bytes/units so far. 225 * @see #getValue 226 * @stable ICU 4.8 227 */ hasValue()228 public boolean hasValue() { return ordinal()>=2; } 229 230 /** 231 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 232 * @return true if another input byte/unit can continue a matching string. 233 * @stable ICU 4.8 234 */ hasNext()235 public boolean hasNext() { return (ordinal()&1)!=0; } 236 } 237 238 /** 239 * Determines whether the byte sequence so far matches, whether it has a value, 240 * and whether another input byte can continue a matching byte sequence. 241 * @return The match/value Result. 242 * @stable ICU 4.8 243 */ current()244 public Result current() /*const*/ { 245 int pos=pos_; 246 if(pos<0) { 247 return Result.NO_MATCH; 248 } else { 249 int node; 250 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 251 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 252 } 253 } 254 255 /** 256 * Traverses the trie from the initial state for this input byte. 257 * Equivalent to reset().next(inByte). 258 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 259 * Values below -0x100 and above 0xff will never match. 260 * @return The match/value Result. 261 * @stable ICU 4.8 262 */ first(int inByte)263 public Result first(int inByte) { 264 remainingMatchLength_=-1; 265 if(inByte<0) { 266 inByte+=0x100; 267 } 268 return nextImpl(root_, inByte); 269 } 270 271 /** 272 * Traverses the trie from the current state for this input byte. 273 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 274 * Values below -0x100 and above 0xff will never match. 275 * @return The match/value Result. 276 * @stable ICU 4.8 277 */ next(int inByte)278 public Result next(int inByte) { 279 int pos=pos_; 280 if(pos<0) { 281 return Result.NO_MATCH; 282 } 283 if(inByte<0) { 284 inByte+=0x100; 285 } 286 int length=remainingMatchLength_; // Actual remaining match length minus 1. 287 if(length>=0) { 288 // Remaining part of a linear-match node. 289 if(inByte==(bytes_[pos++]&0xff)) { 290 remainingMatchLength_=--length; 291 pos_=pos; 292 int node; 293 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 294 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 295 } else { 296 stop(); 297 return Result.NO_MATCH; 298 } 299 } 300 return nextImpl(pos, inByte); 301 } 302 303 /** 304 * Traverses the trie from the current state for this byte sequence. 305 * Equivalent to 306 * <pre> 307 * Result result=current(); 308 * for(each c in s) 309 * if(!result.hasNext()) return Result.NO_MATCH; 310 * result=next(c); 311 * return result; 312 * </pre> 313 * @param s Contains a string or byte sequence. 314 * @param sIndex The start index of the byte sequence in s. 315 * @param sLimit The (exclusive) end index of the byte sequence in s. 316 * @return The match/value Result. 317 * @stable ICU 4.8 318 */ next(byte[] s, int sIndex, int sLimit)319 public Result next(byte[] s, int sIndex, int sLimit) { 320 if(sIndex>=sLimit) { 321 // Empty input. 322 return current(); 323 } 324 int pos=pos_; 325 if(pos<0) { 326 return Result.NO_MATCH; 327 } 328 int length=remainingMatchLength_; // Actual remaining match length minus 1. 329 for(;;) { 330 // Fetch the next input byte, if there is one. 331 // Continue a linear-match node. 332 byte inByte; 333 for(;;) { 334 if(sIndex==sLimit) { 335 remainingMatchLength_=length; 336 pos_=pos; 337 int node; 338 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 339 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 340 } 341 inByte=s[sIndex++]; 342 if(length<0) { 343 remainingMatchLength_=length; 344 break; 345 } 346 if(inByte!=bytes_[pos]) { 347 stop(); 348 return Result.NO_MATCH; 349 } 350 ++pos; 351 --length; 352 } 353 for(;;) { 354 int node=bytes_[pos++]&0xff; 355 if(node<kMinLinearMatch) { 356 Result result=branchNext(pos, node, inByte&0xff); 357 if(result==Result.NO_MATCH) { 358 return Result.NO_MATCH; 359 } 360 // Fetch the next input byte, if there is one. 361 if(sIndex==sLimit) { 362 return result; 363 } 364 if(result==Result.FINAL_VALUE) { 365 // No further matching bytes. 366 stop(); 367 return Result.NO_MATCH; 368 } 369 inByte=s[sIndex++]; 370 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 371 } else if(node<kMinValueLead) { 372 // Match length+1 bytes. 373 length=node-kMinLinearMatch; // Actual match length minus 1. 374 if(inByte!=bytes_[pos]) { 375 stop(); 376 return Result.NO_MATCH; 377 } 378 ++pos; 379 --length; 380 break; 381 } else if((node&kValueIsFinal)!=0) { 382 // No further matching bytes. 383 stop(); 384 return Result.NO_MATCH; 385 } else { 386 // Skip intermediate value. 387 pos=skipValue(pos, node); 388 // The next node must not also be a value node. 389 assert((bytes_[pos]&0xff)<kMinValueLead); 390 } 391 } 392 } 393 } 394 395 /** 396 * Returns a matching byte sequence's value if called immediately after 397 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 398 * getValue() can be called multiple times. 399 * 400 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 401 * @return The value for the byte sequence so far. 402 * @stable ICU 4.8 403 */ 404 public int getValue() /*const*/ { 405 int pos=pos_; 406 int leadByte=bytes_[pos++]&0xff; 407 assert(leadByte>=kMinValueLead); 408 return readValue(bytes_, pos, leadByte>>1); 409 } 410 411 /** 412 * Determines whether all byte sequences reachable from the current state 413 * map to the same value, and if so, returns that value. 414 * @return The unique value in bits 32..1 with bit 0 set, 415 * if all byte sequences reachable from the current state 416 * map to the same value; otherwise returns 0. 417 * @stable ICU 4.8 418 */ getUniqueValue()419 public long getUniqueValue() /*const*/ { 420 int pos=pos_; 421 if(pos<0) { 422 return 0; 423 } 424 // Skip the rest of a pending linear-match node. 425 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 426 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 427 return (uniqueValue<<31)>>31; 428 } 429 430 /** 431 * Finds each byte which continues the byte sequence from the current state. 432 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 433 * @param out Each next byte is 0-extended to a char and appended to this object. 434 * (Only uses the out.append(c) method.) 435 * @return The number of bytes which continue the byte sequence from here. 436 * @stable ICU 4.8 437 */ getNextBytes(Appendable out)438 public int getNextBytes(Appendable out) /*const*/ { 439 int pos=pos_; 440 if(pos<0) { 441 return 0; 442 } 443 if(remainingMatchLength_>=0) { 444 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 445 return 1; 446 } 447 int node=bytes_[pos++]&0xff; 448 if(node>=kMinValueLead) { 449 if((node&kValueIsFinal)!=0) { 450 return 0; 451 } else { 452 pos=skipValue(pos, node); 453 node=bytes_[pos++]&0xff; 454 assert(node<kMinValueLead); 455 } 456 } 457 if(node<kMinLinearMatch) { 458 if(node==0) { 459 node=bytes_[pos++]&0xff; 460 } 461 getNextBranchBytes(bytes_, pos, ++node, out); 462 return node; 463 } else { 464 // First byte of the linear-match node. 465 append(out, bytes_[pos]&0xff); 466 return 1; 467 } 468 } 469 470 /** 471 * Iterates from the current state of this trie. 472 * @return A new BytesTrie.Iterator. 473 * @stable ICU 4.8 474 */ 475 @Override 476 public Iterator iterator() { 477 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 478 } 479 480 /** 481 * Iterates from the current state of this trie. 482 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 483 * Otherwise, the iterator returns strings with this maximum length. 484 * @return A new BytesTrie.Iterator. 485 * @stable ICU 4.8 486 */ 487 public Iterator iterator(int maxStringLength) { 488 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 489 } 490 491 /** 492 * Iterates from the root of a byte-serialized BytesTrie. 493 * @param trieBytes Bytes array that contains the serialized trie. 494 * @param offset Root offset of the trie in the array. 495 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 496 * Otherwise, the iterator returns strings with this maximum length. 497 * @return A new BytesTrie.Iterator. 498 * @stable ICU 4.8 499 */ 500 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 501 return new Iterator(trieBytes, offset, -1, maxStringLength); 502 } 503 504 /** 505 * Return value type for the Iterator. 506 * @stable ICU 4.8 507 */ 508 public static final class Entry { 509 private Entry(int capacity) { 510 bytes=new byte[capacity]; 511 } 512 513 /** 514 * @return The length of the byte sequence. 515 * @stable ICU 4.8 516 */ 517 public int bytesLength() { return length; } 518 /** 519 * Returns a byte of the byte sequence. 520 * @param index An index into the byte sequence. 521 * @return The index-th byte sequence byte. 522 * @stable ICU 4.8 523 */ 524 public byte byteAt(int index) { return bytes[index]; } 525 /** 526 * Copies the byte sequence into a byte array. 527 * @param dest Destination byte array. 528 * @param destOffset Starting offset to where in dest the byte sequence is copied. 529 * @stable ICU 4.8 530 */ 531 public void copyBytesTo(byte[] dest, int destOffset) { 532 System.arraycopy(bytes, 0, dest, destOffset, length); 533 } 534 /** 535 * @return The byte sequence as a read-only ByteBuffer. 536 * @stable ICU 4.8 537 */ 538 public ByteBuffer bytesAsByteBuffer() { 539 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 540 } 541 542 /** 543 * The value associated with the byte sequence. 544 * @stable ICU 4.8 545 */ 546 public int value; 547 548 private void ensureCapacity(int len) { 549 if(bytes.length<len) { 550 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 551 System.arraycopy(bytes, 0, newBytes, 0, length); 552 bytes=newBytes; 553 } 554 } 555 private void append(byte b) { 556 ensureCapacity(length+1); 557 bytes[length++]=b; 558 } 559 private void append(byte[] b, int off, int len) { 560 ensureCapacity(length+len); 561 System.arraycopy(b, off, bytes, length, len); 562 length+=len; 563 } 564 private void truncateString(int newLength) { length=newLength; } 565 566 private byte[] bytes; 567 private int length; 568 } 569 570 /** 571 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 572 * @stable ICU 4.8 573 */ 574 public static final class Iterator implements java.util.Iterator<Entry> { 575 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 576 bytes_=trieBytes; 577 pos_=initialPos_=offset; 578 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 579 maxLength_=maxStringLength; 580 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 581 int length=remainingMatchLength_; // Actual remaining match length minus 1. 582 if(length>=0) { 583 // Pending linear-match node, append remaining bytes to entry_. 584 ++length; 585 if(maxLength_>0 && length>maxLength_) { 586 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 587 } 588 entry_.append(bytes_, pos_, length); 589 pos_+=length; 590 remainingMatchLength_-=length; 591 } 592 } 593 594 /** 595 * Resets this iterator to its initial state. 596 * @return this 597 * @stable ICU 4.8 598 */ reset()599 public Iterator reset() { 600 pos_=initialPos_; 601 remainingMatchLength_=initialRemainingMatchLength_; 602 int length=remainingMatchLength_+1; // Remaining match length. 603 if(maxLength_>0 && length>maxLength_) { 604 length=maxLength_; 605 } 606 entry_.truncateString(length); 607 pos_+=length; 608 remainingMatchLength_-=length; 609 stack_.clear(); 610 return this; 611 } 612 613 /** 614 * @return true if there are more elements. 615 * @stable ICU 4.8 616 */ 617 @Override hasNext()618 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 619 620 /** 621 * Finds the next (byte sequence, value) pair if there is one. 622 * 623 * If the byte sequence is truncated to the maximum length and does not 624 * have a real value, then the value is set to -1. 625 * In this case, this "not a real value" is indistinguishable from 626 * a real value of -1. 627 * @return An Entry with the string and value of the next element. 628 * @throws NoSuchElementException - iteration has no more elements. 629 * @stable ICU 4.8 630 */ 631 @Override next()632 public Entry next() { 633 int pos=pos_; 634 if(pos<0) { 635 if(stack_.isEmpty()) { 636 throw new NoSuchElementException(); 637 } 638 // Pop the state off the stack and continue with the next outbound edge of 639 // the branch node. 640 long top=stack_.remove(stack_.size()-1); 641 int length=(int)top; 642 pos=(int)(top>>32); 643 entry_.truncateString(length&0xffff); 644 length>>>=16; 645 if(length>1) { 646 pos=branchNext(pos, length); 647 if(pos<0) { 648 return entry_; // Reached a final value. 649 } 650 } else { 651 entry_.append(bytes_[pos++]); 652 } 653 } 654 if(remainingMatchLength_>=0) { 655 // We only get here if we started in a pending linear-match node 656 // with more than maxLength remaining bytes. 657 return truncateAndStop(); 658 } 659 for(;;) { 660 int node=bytes_[pos++]&0xff; 661 if(node>=kMinValueLead) { 662 // Deliver value for the byte sequence so far. 663 boolean isFinal=(node&kValueIsFinal)!=0; 664 entry_.value=readValue(bytes_, pos, node>>1); 665 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 666 pos_=-1; 667 } else { 668 pos_=skipValue(pos, node); 669 } 670 return entry_; 671 } 672 if(maxLength_>0 && entry_.length==maxLength_) { 673 return truncateAndStop(); 674 } 675 if(node<kMinLinearMatch) { 676 if(node==0) { 677 node=bytes_[pos++]&0xff; 678 } 679 pos=branchNext(pos, node+1); 680 if(pos<0) { 681 return entry_; // Reached a final value. 682 } 683 } else { 684 // Linear-match node, append length bytes to entry_. 685 int length=node-kMinLinearMatch+1; 686 if(maxLength_>0 && entry_.length+length>maxLength_) { 687 entry_.append(bytes_, pos, maxLength_-entry_.length); 688 return truncateAndStop(); 689 } 690 entry_.append(bytes_, pos, length); 691 pos+=length; 692 } 693 } 694 } 695 696 /** 697 * Iterator.remove() is not supported. 698 * @throws UnsupportedOperationException (always) 699 * @stable ICU 4.8 700 */ 701 @Override remove()702 public void remove() { 703 throw new UnsupportedOperationException(); 704 } 705 truncateAndStop()706 private Entry truncateAndStop() { 707 pos_=-1; 708 entry_.value=-1; // no real value for str 709 return entry_; 710 } 711 branchNext(int pos, int length)712 private int branchNext(int pos, int length) { 713 while(length>kMaxBranchLinearSubNodeLength) { 714 ++pos; // ignore the comparison byte 715 // Push state for the greater-or-equal edge. 716 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 717 // Follow the less-than edge. 718 length>>=1; 719 pos=jumpByDelta(bytes_, pos); 720 } 721 // List of key-value pairs where values are either final values or jump deltas. 722 // Read the first (key, value) pair. 723 byte trieByte=bytes_[pos++]; 724 int node=bytes_[pos++]&0xff; 725 boolean isFinal=(node&kValueIsFinal)!=0; 726 int value=readValue(bytes_, pos, node>>1); 727 pos=skipValue(pos, node); 728 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 729 entry_.append(trieByte); 730 if(isFinal) { 731 pos_=-1; 732 entry_.value=value; 733 return -1; 734 } else { 735 return pos+value; 736 } 737 } 738 739 private byte[] bytes_; 740 private int pos_; 741 private int initialPos_; 742 private int remainingMatchLength_; 743 private int initialRemainingMatchLength_; 744 745 private int maxLength_; 746 private Entry entry_; 747 748 // The stack stores longs for backtracking to another 749 // outbound edge of a branch node. 750 // Each long has the offset from bytes_ in bits 62..32, 751 // the entry_.stringLength() from before the node in bits 15..0, 752 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 753 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 754 // but the code looks more confusing that way.) 755 private ArrayList<Long> stack_=new ArrayList<>(); 756 } 757 stop()758 private void stop() { 759 pos_=-1; 760 } 761 762 // Reads a compact 32-bit integer. 763 // pos is already after the leadByte, and the lead byte is already shifted right by 1. readValue(byte[] bytes, int pos, int leadByte)764 private static int readValue(byte[] bytes, int pos, int leadByte) { 765 int value; 766 if(leadByte<kMinTwoByteValueLead) { 767 value=leadByte-kMinOneByteValueLead; 768 } else if(leadByte<kMinThreeByteValueLead) { 769 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 770 } else if(leadByte<kFourByteValueLead) { 771 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 772 } else if(leadByte==kFourByteValueLead) { 773 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 774 } else { 775 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 776 } 777 return value; 778 } skipValue(int pos, int leadByte)779 private static int skipValue(int pos, int leadByte) { 780 assert(leadByte>=kMinValueLead); 781 if(leadByte>=(kMinTwoByteValueLead<<1)) { 782 if(leadByte<(kMinThreeByteValueLead<<1)) { 783 ++pos; 784 } else if(leadByte<(kFourByteValueLead<<1)) { 785 pos+=2; 786 } else { 787 pos+=3+((leadByte>>1)&1); 788 } 789 } 790 return pos; 791 } skipValue(byte[] bytes, int pos)792 private static int skipValue(byte[] bytes, int pos) { 793 int leadByte=bytes[pos++]&0xff; 794 return skipValue(pos, leadByte); 795 } 796 797 /** 798 * Reads a jump delta and jumps. 799 * @internal 800 * @deprecated This API is ICU internal only. 801 */ 802 @Deprecated jumpByDelta(byte[] bytes, int pos)803 public static int jumpByDelta(byte[] bytes, int pos) { 804 int delta=bytes[pos++]&0xff; 805 if(delta<kMinTwoByteDeltaLead) { 806 // nothing to do 807 } else if(delta<kMinThreeByteDeltaLead) { 808 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 809 } else if(delta<kFourByteDeltaLead) { 810 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 811 pos+=2; 812 } else if(delta==kFourByteDeltaLead) { 813 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 814 pos+=3; 815 } else { 816 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 817 pos+=4; 818 } 819 return pos+delta; 820 } 821 skipDelta(byte[] bytes, int pos)822 private static int skipDelta(byte[] bytes, int pos) { 823 int delta=bytes[pos++]&0xff; 824 if(delta>=kMinTwoByteDeltaLead) { 825 if(delta<kMinThreeByteDeltaLead) { 826 ++pos; 827 } else if(delta<kFourByteDeltaLead) { 828 pos+=2; 829 } else { 830 pos+=3+(delta&1); 831 } 832 } 833 return pos; 834 } 835 836 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 837 838 // Handles a branch node for both next(byte) and next(string). branchNext(int pos, int length, int inByte)839 private Result branchNext(int pos, int length, int inByte) { 840 // Branch according to the current byte. 841 if(length==0) { 842 length=bytes_[pos++]&0xff; 843 } 844 ++length; 845 // The length of the branch is the number of bytes to select from. 846 // The data structure encodes a binary search. 847 while(length>kMaxBranchLinearSubNodeLength) { 848 if(inByte<(bytes_[pos++]&0xff)) { 849 length>>=1; 850 pos=jumpByDelta(bytes_, pos); 851 } else { 852 length=length-(length>>1); 853 pos=skipDelta(bytes_, pos); 854 } 855 } 856 // Drop down to linear search for the last few bytes. 857 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 858 // and divides length by 2. 859 do { 860 if(inByte==(bytes_[pos++]&0xff)) { 861 Result result; 862 int node=bytes_[pos]&0xff; 863 assert(node>=kMinValueLead); 864 if((node&kValueIsFinal)!=0) { 865 // Leave the final value for getValue() to read. 866 result=Result.FINAL_VALUE; 867 } else { 868 // Use the non-final value as the jump delta. 869 ++pos; 870 // int delta=readValue(pos, node>>1); 871 node>>=1; 872 int delta; 873 if(node<kMinTwoByteValueLead) { 874 delta=node-kMinOneByteValueLead; 875 } else if(node<kMinThreeByteValueLead) { 876 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 877 } else if(node<kFourByteValueLead) { 878 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 879 pos+=2; 880 } else if(node==kFourByteValueLead) { 881 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 882 pos+=3; 883 } else { 884 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 885 pos+=4; 886 } 887 // end readValue() 888 pos+=delta; 889 node=bytes_[pos]&0xff; 890 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 891 } 892 pos_=pos; 893 return result; 894 } 895 --length; 896 pos=skipValue(bytes_, pos); 897 } while(length>1); 898 if(inByte==(bytes_[pos++]&0xff)) { 899 pos_=pos; 900 int node=bytes_[pos]&0xff; 901 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 902 } else { 903 stop(); 904 return Result.NO_MATCH; 905 } 906 } 907 908 // Requires remainingLength_<0. nextImpl(int pos, int inByte)909 private Result nextImpl(int pos, int inByte) { 910 for(;;) { 911 int node=bytes_[pos++]&0xff; 912 if(node<kMinLinearMatch) { 913 return branchNext(pos, node, inByte); 914 } else if(node<kMinValueLead) { 915 // Match the first of length+1 bytes. 916 int length=node-kMinLinearMatch; // Actual match length minus 1. 917 if(inByte==(bytes_[pos++]&0xff)) { 918 remainingMatchLength_=--length; 919 pos_=pos; 920 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 921 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 922 } else { 923 // No match. 924 break; 925 } 926 } else if((node&kValueIsFinal)!=0) { 927 // No further matching bytes. 928 break; 929 } else { 930 // Skip intermediate value. 931 pos=skipValue(pos, node); 932 // The next node must not also be a value node. 933 assert((bytes_[pos]&0xff)<kMinValueLead); 934 } 935 } 936 stop(); 937 return Result.NO_MATCH; 938 } 939 940 // Helper functions for getUniqueValue(). 941 // Recursively finds a unique value (or whether there is not a unique one) 942 // from a branch. 943 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 944 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 945 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 946 long uniqueValue) { 947 while(length>kMaxBranchLinearSubNodeLength) { 948 ++pos; // ignore the comparison byte 949 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 950 if(uniqueValue==0) { 951 return 0; 952 } 953 length=length-(length>>1); 954 pos=skipDelta(bytes, pos); 955 } 956 do { 957 ++pos; // ignore a comparison byte 958 // handle its value 959 int node=bytes[pos++]&0xff; 960 boolean isFinal=(node&kValueIsFinal)!=0; 961 int value=readValue(bytes, pos, node>>1); 962 pos=skipValue(pos, node); 963 if(isFinal) { 964 if(uniqueValue!=0) { 965 if(value!=(int)(uniqueValue>>1)) { 966 return 0; 967 } 968 } else { 969 uniqueValue=((long)value<<1)|1; 970 } 971 } else { 972 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 973 if(uniqueValue==0) { 974 return 0; 975 } 976 } 977 } while(--length>1); 978 // ignore the last comparison byte 979 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 980 } 981 // Recursively finds a unique value (or whether there is not a unique one) 982 // starting from a position on a node lead byte. 983 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 984 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. findUniqueValue(byte[] bytes, int pos, long uniqueValue)985 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 986 for(;;) { 987 int node=bytes[pos++]&0xff; 988 if(node<kMinLinearMatch) { 989 if(node==0) { 990 node=bytes[pos++]&0xff; 991 } 992 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 993 if(uniqueValue==0) { 994 return 0; 995 } 996 pos=(int)(uniqueValue>>>33); 997 } else if(node<kMinValueLead) { 998 // linear-match node 999 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 1000 } else { 1001 boolean isFinal=(node&kValueIsFinal)!=0; 1002 int value=readValue(bytes, pos, node>>1); 1003 if(uniqueValue!=0) { 1004 if(value!=(int)(uniqueValue>>1)) { 1005 return 0; 1006 } 1007 } else { 1008 uniqueValue=((long)value<<1)|1; 1009 } 1010 if(isFinal) { 1011 return uniqueValue; 1012 } 1013 pos=skipValue(pos, node); 1014 } 1015 } 1016 } 1017 1018 // Helper functions for getNextBytes(). 1019 // getNextBytes() when pos is on a branch node. getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out)1020 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 1021 while(length>kMaxBranchLinearSubNodeLength) { 1022 ++pos; // ignore the comparison byte 1023 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 1024 length=length-(length>>1); 1025 pos=skipDelta(bytes, pos); 1026 } 1027 do { 1028 append(out, bytes[pos++]&0xff); 1029 pos=skipValue(bytes, pos); 1030 } while(--length>1); 1031 append(out, bytes[pos]&0xff); 1032 } append(Appendable out, int c)1033 private static void append(Appendable out, int c) { 1034 try { 1035 out.append((char)c); 1036 } catch(IOException e) { 1037 throw new ICUUncheckedIOException(e); 1038 } 1039 } 1040 1041 // BytesTrie data structure 1042 // 1043 // The trie consists of a series of byte-serialized nodes for incremental 1044 // string/byte sequence matching. The root node is at the beginning of the trie data. 1045 // 1046 // Types of nodes are distinguished by their node lead byte ranges. 1047 // After each node, except a final-value node, another node follows to 1048 // encode match values or continue matching further bytes. 1049 // 1050 // Node types: 1051 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 1052 // The value is for the string/byte sequence so far. 1053 // One node bit indicates whether the value is final or whether 1054 // matching continues with the next node. 1055 // - Linear-match node: Matches a number of bytes. 1056 // - Branch node: Branches to other nodes according to the current input byte. 1057 // The node byte is the length of the branch (number of bytes to select from) 1058 // minus 1. It is followed by a sub-node: 1059 // - If the length is at most kMaxBranchLinearSubNodeLength, then 1060 // there are length-1 (key, value) pairs and then one more comparison byte. 1061 // If one of the key bytes matches, then the value is either a final value for 1062 // the string/byte sequence so far, or a "jump" delta to the next node. 1063 // If the last byte matches, then matching continues with the next node. 1064 // (Values have the same encoding as value nodes.) 1065 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1066 // there is one byte and one "jump" delta. 1067 // If the input byte is less than the sub-node byte, then "jump" by delta to 1068 // the next sub-node which will have a length of length/2. 1069 // (The delta has its own compact encoding.) 1070 // Otherwise, skip the "jump" delta to the next sub-node 1071 // which will have a length of length-length/2. 1072 1073 // Node lead byte values. 1074 1075 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1076 // the length is one more than the next byte. 1077 1078 // For a branch sub-node with at most this many entries, we drop down 1079 // to a linear search. 1080 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 1081 1082 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1083 /*package*/ static final int kMinLinearMatch=0x10; 1084 /*package*/ static final int kMaxLinearMatchLength=0x10; 1085 1086 // 20..ff: Variable-length value node. 1087 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1088 // Then shift-right by 1 bit. 1089 // The remaining lead byte value indicates the number of following bytes (0..4) 1090 // and contains the value's top bits. 1091 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1092 // It is a final value if bit 0 is set. 1093 private static final int kValueIsFinal=1; 1094 1095 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1096 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1097 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1098 1099 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1100 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1101 1102 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1103 /*package*/ static final int kFourByteValueLead=0x7e; 1104 1105 // A little more than Unicode code points. (0x11ffff) 1106 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1107 1108 /*package*/ static final int kFiveByteValueLead=0x7f; 1109 1110 // Compact delta integers. 1111 /*package*/ static final int kMaxOneByteDelta=0xbf; 1112 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1113 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1114 /*package*/ static final int kFourByteDeltaLead=0xfe; 1115 /*package*/ static final int kFiveByteDeltaLead=0xff; 1116 1117 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1118 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1119 1120 // Fixed value referencing the BytesTrie bytes. 1121 private byte[] bytes_; 1122 private int root_; 1123 1124 // Iterator variables. 1125 1126 // Index of next trie byte to read. Negative if no more matches. 1127 private int pos_; 1128 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1129 private int remainingMatchLength_; 1130 }; 1131