• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   created on: 2010nov23
9 *   created by: Markus W. Scherer
10 *   ported from ICU4C bytestrie.h/.cpp
11 */
12 package com.ibm.icu.util;
13 
14 import java.io.IOException;
15 import java.nio.ByteBuffer;
16 import java.util.ArrayList;
17 import java.util.NoSuchElementException;
18 
19 /**
20  * Light-weight, non-const reader class for a BytesTrie.
21  * Traverses a byte-serialized data structure with minimal state,
22  * for mapping byte sequences to non-negative integer values.
23  *
24  * <p>This class is not intended for public subclassing.
25  *
26  * @stable ICU 4.8
27  * @author Markus W. Scherer
28  */
29 public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
30     /**
31      * Constructs a BytesTrie reader instance.
32      *
33      * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder,
34      * with the offset indicating the first byte of that sequence.
35      * The BytesTrie object will not read more bytes than
36      * the BytesTrieBuilder generated in the corresponding build() call.
37      *
38      * <p>The array is not copied/cloned and must not be modified while
39      * the BytesTrie object is in use.
40      *
41      * @param trieBytes Bytes array that contains the serialized trie.
42      * @param offset Root offset of the trie in the array.
43      * @stable ICU 4.8
44      */
BytesTrie(byte[] trieBytes, int offset)45     public BytesTrie(byte[] trieBytes, int offset) {
46         bytes_=trieBytes;
47         pos_=root_=offset;
48         remainingMatchLength_=-1;
49     }
50 
51     /**
52      * Copy constructor.
53      * Makes a shallow copy of the other trie reader object and its state.
54      * Does not copy the byte array which will be shared.
55      * Same as clone() but without the throws clause.
56      *
57      * @stable ICU 64
58      */
BytesTrie(BytesTrie other)59     public BytesTrie(BytesTrie other) {
60         bytes_ = other.bytes_;
61         root_ = other.root_;
62         pos_ = other.pos_;
63         remainingMatchLength_ = other.remainingMatchLength_;
64     }
65 
66     /**
67      * Clones this trie reader object and its state,
68      * but not the byte array which will be shared.
69      * @return A shallow clone of this trie.
70      * @stable ICU 4.8
71      */
72     @Override
clone()73     public BytesTrie clone() throws CloneNotSupportedException {
74         return (BytesTrie) super.clone();  // A shallow copy is just what we need.
75     }
76 
77     /**
78      * Resets this trie to its initial state.
79      * @return this
80      * @stable ICU 4.8
81      */
reset()82     public BytesTrie reset() {
83         pos_=root_;
84         remainingMatchLength_=-1;
85         return this;
86     }
87 
88     /**
89      * Returns the state of this trie as a 64-bit integer.
90      * The state value is never 0.
91      *
92      * @return opaque state value
93      * @see #resetToState64
94      * @stable ICU 64
95      */
getState64()96     public long getState64() {
97         return ((long)remainingMatchLength_ << 32) | pos_;
98     }
99 
100     /**
101      * Resets this trie to the saved state.
102      * Unlike {@link #resetToState(State)}, the 64-bit state value
103      * must be from {@link #getState64()} from the same trie object or
104      * from one initialized the exact same way.
105      * Because of no validation, this method is faster.
106      *
107      * @param state The opaque trie state value from getState64().
108      * @return this
109      * @see #getState64
110      * @see #resetToState
111      * @see #reset
112      * @stable ICU 64
113      */
resetToState64(long state)114     public BytesTrie resetToState64(long state) {
115         remainingMatchLength_ = (int)(state >> 32);
116         pos_ = (int)state;
117         return this;
118     }
119 
120     /**
121      * BytesTrie state object, for saving a trie's current state
122      * and resetting the trie back to this state later.
123      * @stable ICU 4.8
124      */
125     public static final class State {
126         /**
127          * Constructs an empty State.
128          * @stable ICU 4.8
129          */
State()130         public State() {}
131         private byte[] bytes;
132         private int root;
133         private int pos;
134         private int remainingMatchLength;
135     }
136 
137     /**
138      * Saves the state of this trie.
139      * @param state The State object to hold the trie's state.
140      * @return this
141      * @see #resetToState
142      * @stable ICU 4.8
143      */
saveState(State state)144     public BytesTrie saveState(State state) /*const*/ {
145         state.bytes=bytes_;
146         state.root=root_;
147         state.pos=pos_;
148         state.remainingMatchLength=remainingMatchLength_;
149         return this;
150     }
151 
152     /**
153      * Resets this trie to the saved state.
154      * Slower than {@link #resetToState64(long)} which does not validate the state value.
155      *
156      * @param state The State object which holds a saved trie state.
157      * @return this
158      * @throws IllegalArgumentException if the state object contains no state,
159      *         or the state of a different trie
160      * @see #saveState
161      * @see #reset
162      * @stable ICU 4.8
163      */
resetToState(State state)164     public BytesTrie resetToState(State state) {
165         if(bytes_==state.bytes && bytes_!=null && root_==state.root) {
166             pos_=state.pos;
167             remainingMatchLength_=state.remainingMatchLength;
168         } else {
169             throw new IllegalArgumentException("incompatible trie state");
170         }
171         return this;
172     }
173 
174     /**
175      * Return values for BytesTrie.next(), CharsTrie.next() and similar methods.
176      * @stable ICU 4.8
177      */
178     public enum Result {
179         /**
180          * The input unit(s) did not continue a matching string.
181          * Once current()/next() return NO_MATCH,
182          * all further calls to current()/next() will also return NO_MATCH,
183          * until the trie is reset to its original state or to a saved state.
184          * @stable ICU 4.8
185          */
186         NO_MATCH,
187         /**
188          * The input unit(s) continued a matching string
189          * but there is no value for the string so far.
190          * (It is a prefix of a longer string.)
191          * @stable ICU 4.8
192          */
193         NO_VALUE,
194         /**
195          * The input unit(s) continued a matching string
196          * and there is a value for the string so far.
197          * This value will be returned by getValue().
198          * No further input byte/unit can continue a matching string.
199          * @stable ICU 4.8
200          */
201         FINAL_VALUE,
202         /**
203          * The input unit(s) continued a matching string
204          * and there is a value for the string so far.
205          * This value will be returned by getValue().
206          * Another input byte/unit can continue a matching string.
207          * @stable ICU 4.8
208          */
209         INTERMEDIATE_VALUE;
210 
211         // Note: The following methods assume the particular order
212         // of enum constants, treating the ordinal() values like bit sets.
213         // Do not reorder the enum constants!
214 
215         /**
216          * Same as (result!=NO_MATCH).
217          * @return true if the input bytes/units so far are part of a matching string/byte sequence.
218          * @stable ICU 4.8
219          */
matches()220         public boolean matches() { return this!=NO_MATCH; }
221 
222         /**
223          * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE).
224          * @return true if there is a value for the input bytes/units so far.
225          * @see #getValue
226          * @stable ICU 4.8
227          */
hasValue()228         public boolean hasValue() { return ordinal()>=2; }
229 
230         /**
231          * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE).
232          * @return true if another input byte/unit can continue a matching string.
233          * @stable ICU 4.8
234          */
hasNext()235         public boolean hasNext() { return (ordinal()&1)!=0; }
236     }
237 
238     /**
239      * Determines whether the byte sequence so far matches, whether it has a value,
240      * and whether another input byte can continue a matching byte sequence.
241      * @return The match/value Result.
242      * @stable ICU 4.8
243      */
current()244     public Result current() /*const*/ {
245         int pos=pos_;
246         if(pos<0) {
247             return Result.NO_MATCH;
248         } else {
249             int node;
250             return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
251                     valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
252         }
253     }
254 
255     /**
256      * Traverses the trie from the initial state for this input byte.
257      * Equivalent to reset().next(inByte).
258      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
259      *               Values below -0x100 and above 0xff will never match.
260      * @return The match/value Result.
261      * @stable ICU 4.8
262      */
first(int inByte)263     public Result first(int inByte) {
264         remainingMatchLength_=-1;
265         if(inByte<0) {
266             inByte+=0x100;
267         }
268         return nextImpl(root_, inByte);
269     }
270 
271     /**
272      * Traverses the trie from the current state for this input byte.
273      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
274      *               Values below -0x100 and above 0xff will never match.
275      * @return The match/value Result.
276      * @stable ICU 4.8
277      */
next(int inByte)278     public Result next(int inByte) {
279         int pos=pos_;
280         if(pos<0) {
281             return Result.NO_MATCH;
282         }
283         if(inByte<0) {
284             inByte+=0x100;
285         }
286         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
287         if(length>=0) {
288             // Remaining part of a linear-match node.
289             if(inByte==(bytes_[pos++]&0xff)) {
290                 remainingMatchLength_=--length;
291                 pos_=pos;
292                 int node;
293                 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
294                         valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
295             } else {
296                 stop();
297                 return Result.NO_MATCH;
298             }
299         }
300         return nextImpl(pos, inByte);
301     }
302 
303     /**
304      * Traverses the trie from the current state for this byte sequence.
305      * Equivalent to
306      * <pre>
307      * Result result=current();
308      * for(each c in s)
309      *   if(!result.hasNext()) return Result.NO_MATCH;
310      *   result=next(c);
311      * return result;
312      * </pre>
313      * @param s Contains a string or byte sequence.
314      * @param sIndex The start index of the byte sequence in s.
315      * @param sLimit The (exclusive) end index of the byte sequence in s.
316      * @return The match/value Result.
317      * @stable ICU 4.8
318      */
next(byte[] s, int sIndex, int sLimit)319     public Result next(byte[] s, int sIndex, int sLimit) {
320         if(sIndex>=sLimit) {
321             // Empty input.
322             return current();
323         }
324         int pos=pos_;
325         if(pos<0) {
326             return Result.NO_MATCH;
327         }
328         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
329         for(;;) {
330             // Fetch the next input byte, if there is one.
331             // Continue a linear-match node.
332             byte inByte;
333             for(;;) {
334                 if(sIndex==sLimit) {
335                     remainingMatchLength_=length;
336                     pos_=pos;
337                     int node;
338                     return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ?
339                             valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
340                 }
341                 inByte=s[sIndex++];
342                 if(length<0) {
343                     remainingMatchLength_=length;
344                     break;
345                 }
346                 if(inByte!=bytes_[pos]) {
347                     stop();
348                     return Result.NO_MATCH;
349                 }
350                 ++pos;
351                 --length;
352             }
353             for(;;) {
354                 int node=bytes_[pos++]&0xff;
355                 if(node<kMinLinearMatch) {
356                     Result result=branchNext(pos, node, inByte&0xff);
357                     if(result==Result.NO_MATCH) {
358                         return Result.NO_MATCH;
359                     }
360                     // Fetch the next input byte, if there is one.
361                     if(sIndex==sLimit) {
362                         return result;
363                     }
364                     if(result==Result.FINAL_VALUE) {
365                         // No further matching bytes.
366                         stop();
367                         return Result.NO_MATCH;
368                     }
369                     inByte=s[sIndex++];
370                     pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
371                 } else if(node<kMinValueLead) {
372                     // Match length+1 bytes.
373                     length=node-kMinLinearMatch;  // Actual match length minus 1.
374                     if(inByte!=bytes_[pos]) {
375                         stop();
376                         return Result.NO_MATCH;
377                     }
378                     ++pos;
379                     --length;
380                     break;
381                 } else if((node&kValueIsFinal)!=0) {
382                     // No further matching bytes.
383                     stop();
384                     return Result.NO_MATCH;
385                 } else {
386                     // Skip intermediate value.
387                     pos=skipValue(pos, node);
388                     // The next node must not also be a value node.
389                     assert((bytes_[pos]&0xff)<kMinValueLead);
390                 }
391             }
392         }
393     }
394 
395     /**
396      * Returns a matching byte sequence's value if called immediately after
397      * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
398      * getValue() can be called multiple times.
399      *
400      * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
401      * @return The value for the byte sequence so far.
402      * @stable ICU 4.8
403      */
404     public int getValue() /*const*/ {
405         int pos=pos_;
406         int leadByte=bytes_[pos++]&0xff;
407         assert(leadByte>=kMinValueLead);
408         return readValue(bytes_, pos, leadByte>>1);
409     }
410 
411     /**
412      * Determines whether all byte sequences reachable from the current state
413      * map to the same value, and if so, returns that value.
414      * @return The unique value in bits 32..1 with bit 0 set,
415      *         if all byte sequences reachable from the current state
416      *         map to the same value; otherwise returns 0.
417      * @stable ICU 4.8
418      */
getUniqueValue()419     public long getUniqueValue() /*const*/ {
420         int pos=pos_;
421         if(pos<0) {
422             return 0;
423         }
424         // Skip the rest of a pending linear-match node.
425         long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0);
426         // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
427         return (uniqueValue<<31)>>31;
428     }
429 
430     /**
431      * Finds each byte which continues the byte sequence from the current state.
432      * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
433      * @param out Each next byte is 0-extended to a char and appended to this object.
434      *            (Only uses the out.append(c) method.)
435      * @return The number of bytes which continue the byte sequence from here.
436      * @stable ICU 4.8
437      */
getNextBytes(Appendable out)438     public int getNextBytes(Appendable out) /*const*/ {
439         int pos=pos_;
440         if(pos<0) {
441             return 0;
442         }
443         if(remainingMatchLength_>=0) {
444             append(out, bytes_[pos]&0xff);  // Next byte of a pending linear-match node.
445             return 1;
446         }
447         int node=bytes_[pos++]&0xff;
448         if(node>=kMinValueLead) {
449             if((node&kValueIsFinal)!=0) {
450                 return 0;
451             } else {
452                 pos=skipValue(pos, node);
453                 node=bytes_[pos++]&0xff;
454                 assert(node<kMinValueLead);
455             }
456         }
457         if(node<kMinLinearMatch) {
458             if(node==0) {
459                 node=bytes_[pos++]&0xff;
460             }
461             getNextBranchBytes(bytes_, pos, ++node, out);
462             return node;
463         } else {
464             // First byte of the linear-match node.
465             append(out, bytes_[pos]&0xff);
466             return 1;
467         }
468     }
469 
470     /**
471      * Iterates from the current state of this trie.
472      * @return A new BytesTrie.Iterator.
473      * @stable ICU 4.8
474      */
475     @Override
476     public Iterator iterator() {
477         return new Iterator(bytes_, pos_, remainingMatchLength_, 0);
478     }
479 
480     /**
481      * Iterates from the current state of this trie.
482      * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
483      *                        Otherwise, the iterator returns strings with this maximum length.
484      * @return A new BytesTrie.Iterator.
485      * @stable ICU 4.8
486      */
487     public Iterator iterator(int maxStringLength) {
488         return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength);
489     }
490 
491     /**
492      * Iterates from the root of a byte-serialized BytesTrie.
493      * @param trieBytes Bytes array that contains the serialized trie.
494      * @param offset Root offset of the trie in the array.
495      * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
496      *                        Otherwise, the iterator returns strings with this maximum length.
497      * @return A new BytesTrie.Iterator.
498      * @stable ICU 4.8
499      */
500     public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) {
501         return new Iterator(trieBytes, offset, -1, maxStringLength);
502     }
503 
504     /**
505      * Return value type for the Iterator.
506      * @stable ICU 4.8
507      */
508     public static final class Entry {
509         private Entry(int capacity) {
510             bytes=new byte[capacity];
511         }
512 
513         /**
514          * @return The length of the byte sequence.
515          * @stable ICU 4.8
516          */
517         public int bytesLength() { return length; }
518         /**
519          * Returns a byte of the byte sequence.
520          * @param index An index into the byte sequence.
521          * @return The index-th byte sequence byte.
522          * @stable ICU 4.8
523          */
524         public byte byteAt(int index) { return bytes[index]; }
525         /**
526          * Copies the byte sequence into a byte array.
527          * @param dest Destination byte array.
528          * @param destOffset Starting offset to where in dest the byte sequence is copied.
529          * @stable ICU 4.8
530          */
531         public void copyBytesTo(byte[] dest, int destOffset) {
532             System.arraycopy(bytes, 0, dest, destOffset, length);
533         }
534         /**
535          * @return The byte sequence as a read-only ByteBuffer.
536          * @stable ICU 4.8
537          */
538         public ByteBuffer bytesAsByteBuffer() {
539             return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer();
540         }
541 
542         /**
543          * The value associated with the byte sequence.
544          * @stable ICU 4.8
545          */
546         public int value;
547 
548         private void ensureCapacity(int len) {
549             if(bytes.length<len) {
550                 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)];
551                 System.arraycopy(bytes, 0, newBytes, 0, length);
552                 bytes=newBytes;
553             }
554         }
555         private void append(byte b) {
556             ensureCapacity(length+1);
557             bytes[length++]=b;
558         }
559         private void append(byte[] b, int off, int len) {
560             ensureCapacity(length+len);
561             System.arraycopy(b, off, bytes, length, len);
562             length+=len;
563         }
564         private void truncateString(int newLength) { length=newLength; }
565 
566         private byte[] bytes;
567         private int length;
568     }
569 
570     /**
571      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
572      * @stable ICU 4.8
573      */
574     public static final class Iterator implements java.util.Iterator<Entry> {
575         private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) {
576             bytes_=trieBytes;
577             pos_=initialPos_=offset;
578             remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
579             maxLength_=maxStringLength;
580             entry_=new Entry(maxLength_!=0 ? maxLength_ : 32);
581             int length=remainingMatchLength_;  // Actual remaining match length minus 1.
582             if(length>=0) {
583                 // Pending linear-match node, append remaining bytes to entry_.
584                 ++length;
585                 if(maxLength_>0 && length>maxLength_) {
586                     length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
587                 }
588                 entry_.append(bytes_, pos_, length);
589                 pos_+=length;
590                 remainingMatchLength_-=length;
591             }
592         }
593 
594         /**
595          * Resets this iterator to its initial state.
596          * @return this
597          * @stable ICU 4.8
598          */
reset()599         public Iterator reset() {
600             pos_=initialPos_;
601             remainingMatchLength_=initialRemainingMatchLength_;
602             int length=remainingMatchLength_+1;  // Remaining match length.
603             if(maxLength_>0 && length>maxLength_) {
604                 length=maxLength_;
605             }
606             entry_.truncateString(length);
607             pos_+=length;
608             remainingMatchLength_-=length;
609             stack_.clear();
610             return this;
611         }
612 
613         /**
614          * @return true if there are more elements.
615          * @stable ICU 4.8
616          */
617         @Override
hasNext()618         public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
619 
620         /**
621          * Finds the next (byte sequence, value) pair if there is one.
622          *
623          * If the byte sequence is truncated to the maximum length and does not
624          * have a real value, then the value is set to -1.
625          * In this case, this "not a real value" is indistinguishable from
626          * a real value of -1.
627          * @return An Entry with the string and value of the next element.
628          * @throws NoSuchElementException - iteration has no more elements.
629          * @stable ICU 4.8
630          */
631         @Override
next()632         public Entry next() {
633             int pos=pos_;
634             if(pos<0) {
635                 if(stack_.isEmpty()) {
636                     throw new NoSuchElementException();
637                 }
638                 // Pop the state off the stack and continue with the next outbound edge of
639                 // the branch node.
640                 long top=stack_.remove(stack_.size()-1);
641                 int length=(int)top;
642                 pos=(int)(top>>32);
643                 entry_.truncateString(length&0xffff);
644                 length>>>=16;
645                 if(length>1) {
646                     pos=branchNext(pos, length);
647                     if(pos<0) {
648                         return entry_;  // Reached a final value.
649                     }
650                 } else {
651                     entry_.append(bytes_[pos++]);
652                 }
653             }
654             if(remainingMatchLength_>=0) {
655                 // We only get here if we started in a pending linear-match node
656                 // with more than maxLength remaining bytes.
657                 return truncateAndStop();
658             }
659             for(;;) {
660                 int node=bytes_[pos++]&0xff;
661                 if(node>=kMinValueLead) {
662                     // Deliver value for the byte sequence so far.
663                     boolean isFinal=(node&kValueIsFinal)!=0;
664                     entry_.value=readValue(bytes_, pos, node>>1);
665                     if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) {
666                         pos_=-1;
667                     } else {
668                         pos_=skipValue(pos, node);
669                     }
670                     return entry_;
671                 }
672                 if(maxLength_>0 && entry_.length==maxLength_) {
673                     return truncateAndStop();
674                 }
675                 if(node<kMinLinearMatch) {
676                     if(node==0) {
677                         node=bytes_[pos++]&0xff;
678                     }
679                     pos=branchNext(pos, node+1);
680                     if(pos<0) {
681                         return entry_;  // Reached a final value.
682                     }
683                 } else {
684                     // Linear-match node, append length bytes to entry_.
685                     int length=node-kMinLinearMatch+1;
686                     if(maxLength_>0 && entry_.length+length>maxLength_) {
687                         entry_.append(bytes_, pos, maxLength_-entry_.length);
688                         return truncateAndStop();
689                     }
690                     entry_.append(bytes_, pos, length);
691                     pos+=length;
692                 }
693             }
694         }
695 
696         /**
697          * Iterator.remove() is not supported.
698          * @throws UnsupportedOperationException (always)
699          * @stable ICU 4.8
700          */
701         @Override
remove()702         public void remove() {
703             throw new UnsupportedOperationException();
704         }
705 
truncateAndStop()706         private Entry truncateAndStop() {
707             pos_=-1;
708             entry_.value=-1;  // no real value for str
709             return entry_;
710         }
711 
branchNext(int pos, int length)712         private int branchNext(int pos, int length) {
713             while(length>kMaxBranchLinearSubNodeLength) {
714                 ++pos;  // ignore the comparison byte
715                 // Push state for the greater-or-equal edge.
716                 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length);
717                 // Follow the less-than edge.
718                 length>>=1;
719                 pos=jumpByDelta(bytes_, pos);
720             }
721             // List of key-value pairs where values are either final values or jump deltas.
722             // Read the first (key, value) pair.
723             byte trieByte=bytes_[pos++];
724             int node=bytes_[pos++]&0xff;
725             boolean isFinal=(node&kValueIsFinal)!=0;
726             int value=readValue(bytes_, pos, node>>1);
727             pos=skipValue(pos, node);
728             stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length);
729             entry_.append(trieByte);
730             if(isFinal) {
731                 pos_=-1;
732                 entry_.value=value;
733                 return -1;
734             } else {
735                 return pos+value;
736             }
737         }
738 
739         private byte[] bytes_;
740         private int pos_;
741         private int initialPos_;
742         private int remainingMatchLength_;
743         private int initialRemainingMatchLength_;
744 
745         private int maxLength_;
746         private Entry entry_;
747 
748         // The stack stores longs for backtracking to another
749         // outbound edge of a branch node.
750         // Each long has the offset from bytes_ in bits 62..32,
751         // the entry_.stringLength() from before the node in bits 15..0,
752         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
753         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
754         // but the code looks more confusing that way.)
755         private ArrayList<Long> stack_=new ArrayList<>();
756     }
757 
stop()758     private void stop() {
759         pos_=-1;
760     }
761 
762     // Reads a compact 32-bit integer.
763     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
readValue(byte[] bytes, int pos, int leadByte)764     private static int readValue(byte[] bytes, int pos, int leadByte) {
765         int value;
766         if(leadByte<kMinTwoByteValueLead) {
767             value=leadByte-kMinOneByteValueLead;
768         } else if(leadByte<kMinThreeByteValueLead) {
769             value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff);
770         } else if(leadByte<kFourByteValueLead) {
771             value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
772         } else if(leadByte==kFourByteValueLead) {
773             value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
774         } else {
775             value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
776         }
777         return value;
778     }
skipValue(int pos, int leadByte)779     private static int skipValue(int pos, int leadByte) {
780         assert(leadByte>=kMinValueLead);
781         if(leadByte>=(kMinTwoByteValueLead<<1)) {
782             if(leadByte<(kMinThreeByteValueLead<<1)) {
783                 ++pos;
784             } else if(leadByte<(kFourByteValueLead<<1)) {
785                 pos+=2;
786             } else {
787                 pos+=3+((leadByte>>1)&1);
788             }
789         }
790         return pos;
791     }
skipValue(byte[] bytes, int pos)792     private static int skipValue(byte[] bytes, int pos) {
793         int leadByte=bytes[pos++]&0xff;
794         return skipValue(pos, leadByte);
795     }
796 
797     /**
798      * Reads a jump delta and jumps.
799      * @internal
800      * @deprecated This API is ICU internal only.
801      */
802     @Deprecated
jumpByDelta(byte[] bytes, int pos)803     public static int jumpByDelta(byte[] bytes, int pos) {
804         int delta=bytes[pos++]&0xff;
805         if(delta<kMinTwoByteDeltaLead) {
806             // nothing to do
807         } else if(delta<kMinThreeByteDeltaLead) {
808             delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff);
809         } else if(delta<kFourByteDeltaLead) {
810             delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
811             pos+=2;
812         } else if(delta==kFourByteDeltaLead) {
813             delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
814             pos+=3;
815         } else {
816             delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
817             pos+=4;
818         }
819         return pos+delta;
820     }
821 
skipDelta(byte[] bytes, int pos)822     private static int skipDelta(byte[] bytes, int pos) {
823         int delta=bytes[pos++]&0xff;
824         if(delta>=kMinTwoByteDeltaLead) {
825             if(delta<kMinThreeByteDeltaLead) {
826                 ++pos;
827             } else if(delta<kFourByteDeltaLead) {
828                 pos+=2;
829             } else {
830                 pos+=3+(delta&1);
831             }
832         }
833         return pos;
834     }
835 
836     private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
837 
838     // Handles a branch node for both next(byte) and next(string).
branchNext(int pos, int length, int inByte)839     private Result branchNext(int pos, int length, int inByte) {
840         // Branch according to the current byte.
841         if(length==0) {
842             length=bytes_[pos++]&0xff;
843         }
844         ++length;
845         // The length of the branch is the number of bytes to select from.
846         // The data structure encodes a binary search.
847         while(length>kMaxBranchLinearSubNodeLength) {
848             if(inByte<(bytes_[pos++]&0xff)) {
849                 length>>=1;
850                 pos=jumpByDelta(bytes_, pos);
851             } else {
852                 length=length-(length>>1);
853                 pos=skipDelta(bytes_, pos);
854             }
855         }
856         // Drop down to linear search for the last few bytes.
857         // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
858         // and divides length by 2.
859         do {
860             if(inByte==(bytes_[pos++]&0xff)) {
861                 Result result;
862                 int node=bytes_[pos]&0xff;
863                 assert(node>=kMinValueLead);
864                 if((node&kValueIsFinal)!=0) {
865                     // Leave the final value for getValue() to read.
866                     result=Result.FINAL_VALUE;
867                 } else {
868                     // Use the non-final value as the jump delta.
869                     ++pos;
870                     // int delta=readValue(pos, node>>1);
871                     node>>=1;
872                     int delta;
873                     if(node<kMinTwoByteValueLead) {
874                         delta=node-kMinOneByteValueLead;
875                     } else if(node<kMinThreeByteValueLead) {
876                         delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff);
877                     } else if(node<kFourByteValueLead) {
878                         delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff);
879                         pos+=2;
880                     } else if(node==kFourByteValueLead) {
881                         delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff);
882                         pos+=3;
883                     } else {
884                         delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff);
885                         pos+=4;
886                     }
887                     // end readValue()
888                     pos+=delta;
889                     node=bytes_[pos]&0xff;
890                     result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
891                 }
892                 pos_=pos;
893                 return result;
894             }
895             --length;
896             pos=skipValue(bytes_, pos);
897         } while(length>1);
898         if(inByte==(bytes_[pos++]&0xff)) {
899             pos_=pos;
900             int node=bytes_[pos]&0xff;
901             return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
902         } else {
903             stop();
904             return Result.NO_MATCH;
905         }
906     }
907 
908     // Requires remainingLength_<0.
nextImpl(int pos, int inByte)909     private Result nextImpl(int pos, int inByte) {
910         for(;;) {
911             int node=bytes_[pos++]&0xff;
912             if(node<kMinLinearMatch) {
913                 return branchNext(pos, node, inByte);
914             } else if(node<kMinValueLead) {
915                 // Match the first of length+1 bytes.
916                 int length=node-kMinLinearMatch;  // Actual match length minus 1.
917                 if(inByte==(bytes_[pos++]&0xff)) {
918                     remainingMatchLength_=--length;
919                     pos_=pos;
920                     return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
921                             valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
922                 } else {
923                     // No match.
924                     break;
925                 }
926             } else if((node&kValueIsFinal)!=0) {
927                 // No further matching bytes.
928                 break;
929             } else {
930                 // Skip intermediate value.
931                 pos=skipValue(pos, node);
932                 // The next node must not also be a value node.
933                 assert((bytes_[pos]&0xff)<kMinValueLead);
934             }
935         }
936         stop();
937         return Result.NO_MATCH;
938     }
939 
940     // Helper functions for getUniqueValue().
941     // Recursively finds a unique value (or whether there is not a unique one)
942     // from a branch.
943     // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
944     // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
945     private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length,
946                                                   long uniqueValue) {
947         while(length>kMaxBranchLinearSubNodeLength) {
948             ++pos;  // ignore the comparison byte
949             uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue);
950             if(uniqueValue==0) {
951                 return 0;
952             }
953             length=length-(length>>1);
954             pos=skipDelta(bytes, pos);
955         }
956         do {
957             ++pos;  // ignore a comparison byte
958             // handle its value
959             int node=bytes[pos++]&0xff;
960             boolean isFinal=(node&kValueIsFinal)!=0;
961             int value=readValue(bytes, pos, node>>1);
962             pos=skipValue(pos, node);
963             if(isFinal) {
964                 if(uniqueValue!=0) {
965                     if(value!=(int)(uniqueValue>>1)) {
966                         return 0;
967                     }
968                 } else {
969                     uniqueValue=((long)value<<1)|1;
970                 }
971             } else {
972                 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue);
973                 if(uniqueValue==0) {
974                     return 0;
975                 }
976             }
977         } while(--length>1);
978         // ignore the last comparison byte
979         return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
980     }
981     // Recursively finds a unique value (or whether there is not a unique one)
982     // starting from a position on a node lead byte.
983     // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
984     // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
findUniqueValue(byte[] bytes, int pos, long uniqueValue)985     private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) {
986         for(;;) {
987             int node=bytes[pos++]&0xff;
988             if(node<kMinLinearMatch) {
989                 if(node==0) {
990                     node=bytes[pos++]&0xff;
991                 }
992                 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue);
993                 if(uniqueValue==0) {
994                     return 0;
995                 }
996                 pos=(int)(uniqueValue>>>33);
997             } else if(node<kMinValueLead) {
998                 // linear-match node
999                 pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
1000             } else {
1001                 boolean isFinal=(node&kValueIsFinal)!=0;
1002                 int value=readValue(bytes, pos, node>>1);
1003                 if(uniqueValue!=0) {
1004                     if(value!=(int)(uniqueValue>>1)) {
1005                         return 0;
1006                     }
1007                 } else {
1008                     uniqueValue=((long)value<<1)|1;
1009                 }
1010                 if(isFinal) {
1011                     return uniqueValue;
1012                 }
1013                 pos=skipValue(pos, node);
1014             }
1015         }
1016     }
1017 
1018     // Helper functions for getNextBytes().
1019     // getNextBytes() when pos is on a branch node.
getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out)1020     private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) {
1021         while(length>kMaxBranchLinearSubNodeLength) {
1022             ++pos;  // ignore the comparison byte
1023             getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out);
1024             length=length-(length>>1);
1025             pos=skipDelta(bytes, pos);
1026         }
1027         do {
1028             append(out, bytes[pos++]&0xff);
1029             pos=skipValue(bytes, pos);
1030         } while(--length>1);
1031         append(out, bytes[pos]&0xff);
1032     }
append(Appendable out, int c)1033     private static void append(Appendable out, int c) {
1034         try {
1035             out.append((char)c);
1036         } catch(IOException e) {
1037             throw new ICUUncheckedIOException(e);
1038         }
1039     }
1040 
1041     // BytesTrie data structure
1042     //
1043     // The trie consists of a series of byte-serialized nodes for incremental
1044     // string/byte sequence matching. The root node is at the beginning of the trie data.
1045     //
1046     // Types of nodes are distinguished by their node lead byte ranges.
1047     // After each node, except a final-value node, another node follows to
1048     // encode match values or continue matching further bytes.
1049     //
1050     // Node types:
1051     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
1052     //    The value is for the string/byte sequence so far.
1053     //    One node bit indicates whether the value is final or whether
1054     //    matching continues with the next node.
1055     //  - Linear-match node: Matches a number of bytes.
1056     //  - Branch node: Branches to other nodes according to the current input byte.
1057     //    The node byte is the length of the branch (number of bytes to select from)
1058     //    minus 1. It is followed by a sub-node:
1059     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
1060     //      there are length-1 (key, value) pairs and then one more comparison byte.
1061     //      If one of the key bytes matches, then the value is either a final value for
1062     //      the string/byte sequence so far, or a "jump" delta to the next node.
1063     //      If the last byte matches, then matching continues with the next node.
1064     //      (Values have the same encoding as value nodes.)
1065     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
1066     //      there is one byte and one "jump" delta.
1067     //      If the input byte is less than the sub-node byte, then "jump" by delta to
1068     //      the next sub-node which will have a length of length/2.
1069     //      (The delta has its own compact encoding.)
1070     //      Otherwise, skip the "jump" delta to the next sub-node
1071     //      which will have a length of length-length/2.
1072 
1073     // Node lead byte values.
1074 
1075     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
1076     // the length is one more than the next byte.
1077 
1078     // For a branch sub-node with at most this many entries, we drop down
1079     // to a linear search.
1080     /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
1081 
1082     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
1083     /*package*/ static final int kMinLinearMatch=0x10;
1084     /*package*/ static final int kMaxLinearMatchLength=0x10;
1085 
1086     // 20..ff: Variable-length value node.
1087     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
1088     // Then shift-right by 1 bit.
1089     // The remaining lead byte value indicates the number of following bytes (0..4)
1090     // and contains the value's top bits.
1091     /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
1092     // It is a final value if bit 0 is set.
1093     private static final int kValueIsFinal=1;
1094 
1095     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
1096     /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2;  // 0x10
1097     /*package*/ static final int kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
1098 
1099     /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
1100     /*package*/ static final int kMaxTwoByteValue=0x1aff;
1101 
1102     /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
1103     /*package*/ static final int kFourByteValueLead=0x7e;
1104 
1105     // A little more than Unicode code points. (0x11ffff)
1106     /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
1107 
1108     /*package*/ static final int kFiveByteValueLead=0x7f;
1109 
1110     // Compact delta integers.
1111     /*package*/ static final int kMaxOneByteDelta=0xbf;
1112     /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
1113     /*package*/ static final int kMinThreeByteDeltaLead=0xf0;
1114     /*package*/ static final int kFourByteDeltaLead=0xfe;
1115     /*package*/ static final int kFiveByteDeltaLead=0xff;
1116 
1117     /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
1118     /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
1119 
1120     // Fixed value referencing the BytesTrie bytes.
1121     private byte[] bytes_;
1122     private int root_;
1123 
1124     // Iterator variables.
1125 
1126     // Index of next trie byte to read. Negative if no more matches.
1127     private int pos_;
1128     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1129     private int remainingMatchLength_;
1130 };
1131