• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2010-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   created on: 2010nov23
9 *   created by: Markus W. Scherer
10 *   ported from ICU4C bytestrie.h/.cpp
11 */
12 package com.ibm.icu.util;
13 
14 import java.io.IOException;
15 import java.nio.ByteBuffer;
16 import java.util.ArrayList;
17 import java.util.NoSuchElementException;
18 
19 /**
20  * Light-weight, non-const reader class for a BytesTrie.
21  * Traverses a byte-serialized data structure with minimal state,
22  * for mapping byte sequences to non-negative integer values.
23  *
24  * <p>This class is not intended for public subclassing.
25  *
26  * @stable ICU 4.8
27  * @author Markus W. Scherer
28  */
29 public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
30     /**
31      * Constructs a BytesTrie reader instance.
32      *
33      * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder,
34      * with the offset indicating the first byte of that sequence.
35      * The BytesTrie object will not read more bytes than
36      * the BytesTrieBuilder generated in the corresponding build() call.
37      *
38      * <p>The array is not copied/cloned and must not be modified while
39      * the BytesTrie object is in use.
40      *
41      * @param trieBytes Bytes array that contains the serialized trie.
42      * @param offset Root offset of the trie in the array.
43      * @stable ICU 4.8
44      */
BytesTrie(byte[] trieBytes, int offset)45     public BytesTrie(byte[] trieBytes, int offset) {
46         bytes_=trieBytes;
47         pos_=root_=offset;
48         remainingMatchLength_=-1;
49     }
50 
51     /**
52      * Clones this trie reader object and its state,
53      * but not the byte array which will be shared.
54      * @return A shallow clone of this trie.
55      * @stable ICU 4.8
56      */
57     @Override
clone()58     public Object clone() throws CloneNotSupportedException {
59         return super.clone();  // A shallow copy is just what we need.
60     }
61 
62     /**
63      * Resets this trie to its initial state.
64      * @return this
65      * @stable ICU 4.8
66      */
reset()67     public BytesTrie reset() {
68         pos_=root_;
69         remainingMatchLength_=-1;
70         return this;
71     }
72 
73     /**
74      * BytesTrie state object, for saving a trie's current state
75      * and resetting the trie back to this state later.
76      * @stable ICU 4.8
77      */
78     public static final class State {
79         /**
80          * Constructs an empty State.
81          * @stable ICU 4.8
82          */
State()83         public State() {}
84         private byte[] bytes;
85         private int root;
86         private int pos;
87         private int remainingMatchLength;
88     }
89 
90     /**
91      * Saves the state of this trie.
92      * @param state The State object to hold the trie's state.
93      * @return this
94      * @see #resetToState
95      * @stable ICU 4.8
96      */
saveState(State state)97     public BytesTrie saveState(State state) /*const*/ {
98         state.bytes=bytes_;
99         state.root=root_;
100         state.pos=pos_;
101         state.remainingMatchLength=remainingMatchLength_;
102         return this;
103     }
104 
105     /**
106      * Resets this trie to the saved state.
107      * @param state The State object which holds a saved trie state.
108      * @return this
109      * @throws IllegalArgumentException if the state object contains no state,
110      *         or the state of a different trie
111      * @see #saveState
112      * @see #reset
113      * @stable ICU 4.8
114      */
resetToState(State state)115     public BytesTrie resetToState(State state) {
116         if(bytes_==state.bytes && bytes_!=null && root_==state.root) {
117             pos_=state.pos;
118             remainingMatchLength_=state.remainingMatchLength;
119         } else {
120             throw new IllegalArgumentException("incompatible trie state");
121         }
122         return this;
123     }
124 
125     /**
126      * Return values for BytesTrie.next(), CharsTrie.next() and similar methods.
127      * @stable ICU 4.8
128      */
129     public enum Result {
130         /**
131          * The input unit(s) did not continue a matching string.
132          * Once current()/next() return NO_MATCH,
133          * all further calls to current()/next() will also return NO_MATCH,
134          * until the trie is reset to its original state or to a saved state.
135          * @stable ICU 4.8
136          */
137         NO_MATCH,
138         /**
139          * The input unit(s) continued a matching string
140          * but there is no value for the string so far.
141          * (It is a prefix of a longer string.)
142          * @stable ICU 4.8
143          */
144         NO_VALUE,
145         /**
146          * The input unit(s) continued a matching string
147          * and there is a value for the string so far.
148          * This value will be returned by getValue().
149          * No further input byte/unit can continue a matching string.
150          * @stable ICU 4.8
151          */
152         FINAL_VALUE,
153         /**
154          * The input unit(s) continued a matching string
155          * and there is a value for the string so far.
156          * This value will be returned by getValue().
157          * Another input byte/unit can continue a matching string.
158          * @stable ICU 4.8
159          */
160         INTERMEDIATE_VALUE;
161 
162         // Note: The following methods assume the particular order
163         // of enum constants, treating the ordinal() values like bit sets.
164         // Do not reorder the enum constants!
165 
166         /**
167          * Same as (result!=NO_MATCH).
168          * @return true if the input bytes/units so far are part of a matching string/byte sequence.
169          * @stable ICU 4.8
170          */
matches()171         public boolean matches() { return this!=NO_MATCH; }
172 
173         /**
174          * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE).
175          * @return true if there is a value for the input bytes/units so far.
176          * @see #getValue
177          * @stable ICU 4.8
178          */
hasValue()179         public boolean hasValue() { return ordinal()>=2; }
180 
181         /**
182          * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE).
183          * @return true if another input byte/unit can continue a matching string.
184          * @stable ICU 4.8
185          */
hasNext()186         public boolean hasNext() { return (ordinal()&1)!=0; }
187     }
188 
189     /**
190      * Determines whether the byte sequence so far matches, whether it has a value,
191      * and whether another input byte can continue a matching byte sequence.
192      * @return The match/value Result.
193      * @stable ICU 4.8
194      */
current()195     public Result current() /*const*/ {
196         int pos=pos_;
197         if(pos<0) {
198             return Result.NO_MATCH;
199         } else {
200             int node;
201             return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
202                     valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
203         }
204     }
205 
206     /**
207      * Traverses the trie from the initial state for this input byte.
208      * Equivalent to reset().next(inByte).
209      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
210      *               Values below -0x100 and above 0xff will never match.
211      * @return The match/value Result.
212      * @stable ICU 4.8
213      */
first(int inByte)214     public Result first(int inByte) {
215         remainingMatchLength_=-1;
216         if(inByte<0) {
217             inByte+=0x100;
218         }
219         return nextImpl(root_, inByte);
220     }
221 
222     /**
223      * Traverses the trie from the current state for this input byte.
224      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
225      *               Values below -0x100 and above 0xff will never match.
226      * @return The match/value Result.
227      * @stable ICU 4.8
228      */
next(int inByte)229     public Result next(int inByte) {
230         int pos=pos_;
231         if(pos<0) {
232             return Result.NO_MATCH;
233         }
234         if(inByte<0) {
235             inByte+=0x100;
236         }
237         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
238         if(length>=0) {
239             // Remaining part of a linear-match node.
240             if(inByte==(bytes_[pos++]&0xff)) {
241                 remainingMatchLength_=--length;
242                 pos_=pos;
243                 int node;
244                 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
245                         valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
246             } else {
247                 stop();
248                 return Result.NO_MATCH;
249             }
250         }
251         return nextImpl(pos, inByte);
252     }
253 
254     /**
255      * Traverses the trie from the current state for this byte sequence.
256      * Equivalent to
257      * <pre>
258      * Result result=current();
259      * for(each c in s)
260      *   if(!result.hasNext()) return Result.NO_MATCH;
261      *   result=next(c);
262      * return result;
263      * </pre>
264      * @param s Contains a string or byte sequence.
265      * @param sIndex The start index of the byte sequence in s.
266      * @param sLimit The (exclusive) end index of the byte sequence in s.
267      * @return The match/value Result.
268      * @stable ICU 4.8
269      */
next(byte[] s, int sIndex, int sLimit)270     public Result next(byte[] s, int sIndex, int sLimit) {
271         if(sIndex>=sLimit) {
272             // Empty input.
273             return current();
274         }
275         int pos=pos_;
276         if(pos<0) {
277             return Result.NO_MATCH;
278         }
279         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
280         for(;;) {
281             // Fetch the next input byte, if there is one.
282             // Continue a linear-match node.
283             byte inByte;
284             for(;;) {
285                 if(sIndex==sLimit) {
286                     remainingMatchLength_=length;
287                     pos_=pos;
288                     int node;
289                     return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ?
290                             valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
291                 }
292                 inByte=s[sIndex++];
293                 if(length<0) {
294                     remainingMatchLength_=length;
295                     break;
296                 }
297                 if(inByte!=bytes_[pos]) {
298                     stop();
299                     return Result.NO_MATCH;
300                 }
301                 ++pos;
302                 --length;
303             }
304             for(;;) {
305                 int node=bytes_[pos++]&0xff;
306                 if(node<kMinLinearMatch) {
307                     Result result=branchNext(pos, node, inByte&0xff);
308                     if(result==Result.NO_MATCH) {
309                         return Result.NO_MATCH;
310                     }
311                     // Fetch the next input byte, if there is one.
312                     if(sIndex==sLimit) {
313                         return result;
314                     }
315                     if(result==Result.FINAL_VALUE) {
316                         // No further matching bytes.
317                         stop();
318                         return Result.NO_MATCH;
319                     }
320                     inByte=s[sIndex++];
321                     pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
322                 } else if(node<kMinValueLead) {
323                     // Match length+1 bytes.
324                     length=node-kMinLinearMatch;  // Actual match length minus 1.
325                     if(inByte!=bytes_[pos]) {
326                         stop();
327                         return Result.NO_MATCH;
328                     }
329                     ++pos;
330                     --length;
331                     break;
332                 } else if((node&kValueIsFinal)!=0) {
333                     // No further matching bytes.
334                     stop();
335                     return Result.NO_MATCH;
336                 } else {
337                     // Skip intermediate value.
338                     pos=skipValue(pos, node);
339                     // The next node must not also be a value node.
340                     assert((bytes_[pos]&0xff)<kMinValueLead);
341                 }
342             }
343         }
344     }
345 
346     /**
347      * Returns a matching byte sequence's value if called immediately after
348      * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
349      * getValue() can be called multiple times.
350      *
351      * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
352      * @return The value for the byte sequence so far.
353      * @stable ICU 4.8
354      */
355     public int getValue() /*const*/ {
356         int pos=pos_;
357         int leadByte=bytes_[pos++]&0xff;
358         assert(leadByte>=kMinValueLead);
359         return readValue(bytes_, pos, leadByte>>1);
360     }
361 
362     /**
363      * Determines whether all byte sequences reachable from the current state
364      * map to the same value, and if so, returns that value.
365      * @return The unique value in bits 32..1 with bit 0 set,
366      *         if all byte sequences reachable from the current state
367      *         map to the same value; otherwise returns 0.
368      * @stable ICU 4.8
369      */
getUniqueValue()370     public long getUniqueValue() /*const*/ {
371         int pos=pos_;
372         if(pos<0) {
373             return 0;
374         }
375         // Skip the rest of a pending linear-match node.
376         long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0);
377         // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
378         return (uniqueValue<<31)>>31;
379     }
380 
381     /**
382      * Finds each byte which continues the byte sequence from the current state.
383      * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
384      * @param out Each next byte is 0-extended to a char and appended to this object.
385      *            (Only uses the out.append(c) method.)
386      * @return The number of bytes which continue the byte sequence from here.
387      * @stable ICU 4.8
388      */
getNextBytes(Appendable out)389     public int getNextBytes(Appendable out) /*const*/ {
390         int pos=pos_;
391         if(pos<0) {
392             return 0;
393         }
394         if(remainingMatchLength_>=0) {
395             append(out, bytes_[pos]&0xff);  // Next byte of a pending linear-match node.
396             return 1;
397         }
398         int node=bytes_[pos++]&0xff;
399         if(node>=kMinValueLead) {
400             if((node&kValueIsFinal)!=0) {
401                 return 0;
402             } else {
403                 pos=skipValue(pos, node);
404                 node=bytes_[pos++]&0xff;
405                 assert(node<kMinValueLead);
406             }
407         }
408         if(node<kMinLinearMatch) {
409             if(node==0) {
410                 node=bytes_[pos++]&0xff;
411             }
412             getNextBranchBytes(bytes_, pos, ++node, out);
413             return node;
414         } else {
415             // First byte of the linear-match node.
416             append(out, bytes_[pos]&0xff);
417             return 1;
418         }
419     }
420 
421     /**
422      * Iterates from the current state of this trie.
423      * @return A new BytesTrie.Iterator.
424      * @stable ICU 4.8
425      */
426     @Override
427     public Iterator iterator() {
428         return new Iterator(bytes_, pos_, remainingMatchLength_, 0);
429     }
430 
431     /**
432      * Iterates from the current state of this trie.
433      * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
434      *                        Otherwise, the iterator returns strings with this maximum length.
435      * @return A new BytesTrie.Iterator.
436      * @stable ICU 4.8
437      */
438     public Iterator iterator(int maxStringLength) {
439         return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength);
440     }
441 
442     /**
443      * Iterates from the root of a byte-serialized BytesTrie.
444      * @param trieBytes Bytes array that contains the serialized trie.
445      * @param offset Root offset of the trie in the array.
446      * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
447      *                        Otherwise, the iterator returns strings with this maximum length.
448      * @return A new BytesTrie.Iterator.
449      * @stable ICU 4.8
450      */
451     public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) {
452         return new Iterator(trieBytes, offset, -1, maxStringLength);
453     }
454 
455     /**
456      * Return value type for the Iterator.
457      * @stable ICU 4.8
458      */
459     public static final class Entry {
460         private Entry(int capacity) {
461             bytes=new byte[capacity];
462         }
463 
464         /**
465          * @return The length of the byte sequence.
466          * @stable ICU 4.8
467          */
468         public int bytesLength() { return length; }
469         /**
470          * Returns a byte of the byte sequence.
471          * @param index An index into the byte sequence.
472          * @return The index-th byte sequence byte.
473          * @stable ICU 4.8
474          */
475         public byte byteAt(int index) { return bytes[index]; }
476         /**
477          * Copies the byte sequence into a byte array.
478          * @param dest Destination byte array.
479          * @param destOffset Starting offset to where in dest the byte sequence is copied.
480          * @stable ICU 4.8
481          */
482         public void copyBytesTo(byte[] dest, int destOffset) {
483             System.arraycopy(bytes, 0, dest, destOffset, length);
484         }
485         /**
486          * @return The byte sequence as a read-only ByteBuffer.
487          * @stable ICU 4.8
488          */
489         public ByteBuffer bytesAsByteBuffer() {
490             return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer();
491         }
492 
493         /**
494          * The value associated with the byte sequence.
495          * @stable ICU 4.8
496          */
497         public int value;
498 
499         private void ensureCapacity(int len) {
500             if(bytes.length<len) {
501                 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)];
502                 System.arraycopy(bytes, 0, newBytes, 0, length);
503                 bytes=newBytes;
504             }
505         }
506         private void append(byte b) {
507             ensureCapacity(length+1);
508             bytes[length++]=b;
509         }
510         private void append(byte[] b, int off, int len) {
511             ensureCapacity(length+len);
512             System.arraycopy(b, off, bytes, length, len);
513             length+=len;
514         }
515         private void truncateString(int newLength) { length=newLength; }
516 
517         private byte[] bytes;
518         private int length;
519     }
520 
521     /**
522      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
523      * @stable ICU 4.8
524      */
525     public static final class Iterator implements java.util.Iterator<Entry> {
526         private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) {
527             bytes_=trieBytes;
528             pos_=initialPos_=offset;
529             remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
530             maxLength_=maxStringLength;
531             entry_=new Entry(maxLength_!=0 ? maxLength_ : 32);
532             int length=remainingMatchLength_;  // Actual remaining match length minus 1.
533             if(length>=0) {
534                 // Pending linear-match node, append remaining bytes to entry_.
535                 ++length;
536                 if(maxLength_>0 && length>maxLength_) {
537                     length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
538                 }
539                 entry_.append(bytes_, pos_, length);
540                 pos_+=length;
541                 remainingMatchLength_-=length;
542             }
543         }
544 
545         /**
546          * Resets this iterator to its initial state.
547          * @return this
548          * @stable ICU 4.8
549          */
reset()550         public Iterator reset() {
551             pos_=initialPos_;
552             remainingMatchLength_=initialRemainingMatchLength_;
553             int length=remainingMatchLength_+1;  // Remaining match length.
554             if(maxLength_>0 && length>maxLength_) {
555                 length=maxLength_;
556             }
557             entry_.truncateString(length);
558             pos_+=length;
559             remainingMatchLength_-=length;
560             stack_.clear();
561             return this;
562         }
563 
564         /**
565          * @return true if there are more elements.
566          * @stable ICU 4.8
567          */
568         @Override
hasNext()569         public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
570 
571         /**
572          * Finds the next (byte sequence, value) pair if there is one.
573          *
574          * If the byte sequence is truncated to the maximum length and does not
575          * have a real value, then the value is set to -1.
576          * In this case, this "not a real value" is indistinguishable from
577          * a real value of -1.
578          * @return An Entry with the string and value of the next element.
579          * @throws NoSuchElementException - iteration has no more elements.
580          * @stable ICU 4.8
581          */
582         @Override
next()583         public Entry next() {
584             int pos=pos_;
585             if(pos<0) {
586                 if(stack_.isEmpty()) {
587                     throw new NoSuchElementException();
588                 }
589                 // Pop the state off the stack and continue with the next outbound edge of
590                 // the branch node.
591                 long top=stack_.remove(stack_.size()-1);
592                 int length=(int)top;
593                 pos=(int)(top>>32);
594                 entry_.truncateString(length&0xffff);
595                 length>>>=16;
596                 if(length>1) {
597                     pos=branchNext(pos, length);
598                     if(pos<0) {
599                         return entry_;  // Reached a final value.
600                     }
601                 } else {
602                     entry_.append(bytes_[pos++]);
603                 }
604             }
605             if(remainingMatchLength_>=0) {
606                 // We only get here if we started in a pending linear-match node
607                 // with more than maxLength remaining bytes.
608                 return truncateAndStop();
609             }
610             for(;;) {
611                 int node=bytes_[pos++]&0xff;
612                 if(node>=kMinValueLead) {
613                     // Deliver value for the byte sequence so far.
614                     boolean isFinal=(node&kValueIsFinal)!=0;
615                     entry_.value=readValue(bytes_, pos, node>>1);
616                     if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) {
617                         pos_=-1;
618                     } else {
619                         pos_=skipValue(pos, node);
620                     }
621                     return entry_;
622                 }
623                 if(maxLength_>0 && entry_.length==maxLength_) {
624                     return truncateAndStop();
625                 }
626                 if(node<kMinLinearMatch) {
627                     if(node==0) {
628                         node=bytes_[pos++]&0xff;
629                     }
630                     pos=branchNext(pos, node+1);
631                     if(pos<0) {
632                         return entry_;  // Reached a final value.
633                     }
634                 } else {
635                     // Linear-match node, append length bytes to entry_.
636                     int length=node-kMinLinearMatch+1;
637                     if(maxLength_>0 && entry_.length+length>maxLength_) {
638                         entry_.append(bytes_, pos, maxLength_-entry_.length);
639                         return truncateAndStop();
640                     }
641                     entry_.append(bytes_, pos, length);
642                     pos+=length;
643                 }
644             }
645         }
646 
647         /**
648          * Iterator.remove() is not supported.
649          * @throws UnsupportedOperationException (always)
650          * @stable ICU 4.8
651          */
652         @Override
remove()653         public void remove() {
654             throw new UnsupportedOperationException();
655         }
656 
truncateAndStop()657         private Entry truncateAndStop() {
658             pos_=-1;
659             entry_.value=-1;  // no real value for str
660             return entry_;
661         }
662 
branchNext(int pos, int length)663         private int branchNext(int pos, int length) {
664             while(length>kMaxBranchLinearSubNodeLength) {
665                 ++pos;  // ignore the comparison byte
666                 // Push state for the greater-or-equal edge.
667                 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length);
668                 // Follow the less-than edge.
669                 length>>=1;
670                 pos=jumpByDelta(bytes_, pos);
671             }
672             // List of key-value pairs where values are either final values or jump deltas.
673             // Read the first (key, value) pair.
674             byte trieByte=bytes_[pos++];
675             int node=bytes_[pos++]&0xff;
676             boolean isFinal=(node&kValueIsFinal)!=0;
677             int value=readValue(bytes_, pos, node>>1);
678             pos=skipValue(pos, node);
679             stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length);
680             entry_.append(trieByte);
681             if(isFinal) {
682                 pos_=-1;
683                 entry_.value=value;
684                 return -1;
685             } else {
686                 return pos+value;
687             }
688         }
689 
690         private byte[] bytes_;
691         private int pos_;
692         private int initialPos_;
693         private int remainingMatchLength_;
694         private int initialRemainingMatchLength_;
695 
696         private int maxLength_;
697         private Entry entry_;
698 
699         // The stack stores longs for backtracking to another
700         // outbound edge of a branch node.
701         // Each long has the offset from bytes_ in bits 62..32,
702         // the entry_.stringLength() from before the node in bits 15..0,
703         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
704         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
705         // but the code looks more confusing that way.)
706         private ArrayList<Long> stack_=new ArrayList<Long>();
707     }
708 
stop()709     private void stop() {
710         pos_=-1;
711     }
712 
713     // Reads a compact 32-bit integer.
714     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
readValue(byte[] bytes, int pos, int leadByte)715     private static int readValue(byte[] bytes, int pos, int leadByte) {
716         int value;
717         if(leadByte<kMinTwoByteValueLead) {
718             value=leadByte-kMinOneByteValueLead;
719         } else if(leadByte<kMinThreeByteValueLead) {
720             value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff);
721         } else if(leadByte<kFourByteValueLead) {
722             value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
723         } else if(leadByte==kFourByteValueLead) {
724             value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
725         } else {
726             value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
727         }
728         return value;
729     }
skipValue(int pos, int leadByte)730     private static int skipValue(int pos, int leadByte) {
731         assert(leadByte>=kMinValueLead);
732         if(leadByte>=(kMinTwoByteValueLead<<1)) {
733             if(leadByte<(kMinThreeByteValueLead<<1)) {
734                 ++pos;
735             } else if(leadByte<(kFourByteValueLead<<1)) {
736                 pos+=2;
737             } else {
738                 pos+=3+((leadByte>>1)&1);
739             }
740         }
741         return pos;
742     }
skipValue(byte[] bytes, int pos)743     private static int skipValue(byte[] bytes, int pos) {
744         int leadByte=bytes[pos++]&0xff;
745         return skipValue(pos, leadByte);
746     }
747 
748     // Reads a jump delta and jumps.
jumpByDelta(byte[] bytes, int pos)749     private static int jumpByDelta(byte[] bytes, int pos) {
750         int delta=bytes[pos++]&0xff;
751         if(delta<kMinTwoByteDeltaLead) {
752             // nothing to do
753         } else if(delta<kMinThreeByteDeltaLead) {
754             delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff);
755         } else if(delta<kFourByteDeltaLead) {
756             delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
757             pos+=2;
758         } else if(delta==kFourByteDeltaLead) {
759             delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
760             pos+=3;
761         } else {
762             delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
763             pos+=4;
764         }
765         return pos+delta;
766     }
767 
skipDelta(byte[] bytes, int pos)768     private static int skipDelta(byte[] bytes, int pos) {
769         int delta=bytes[pos++]&0xff;
770         if(delta>=kMinTwoByteDeltaLead) {
771             if(delta<kMinThreeByteDeltaLead) {
772                 ++pos;
773             } else if(delta<kFourByteDeltaLead) {
774                 pos+=2;
775             } else {
776                 pos+=3+(delta&1);
777             }
778         }
779         return pos;
780     }
781 
782     private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
783 
784     // Handles a branch node for both next(byte) and next(string).
branchNext(int pos, int length, int inByte)785     private Result branchNext(int pos, int length, int inByte) {
786         // Branch according to the current byte.
787         if(length==0) {
788             length=bytes_[pos++]&0xff;
789         }
790         ++length;
791         // The length of the branch is the number of bytes to select from.
792         // The data structure encodes a binary search.
793         while(length>kMaxBranchLinearSubNodeLength) {
794             if(inByte<(bytes_[pos++]&0xff)) {
795                 length>>=1;
796                 pos=jumpByDelta(bytes_, pos);
797             } else {
798                 length=length-(length>>1);
799                 pos=skipDelta(bytes_, pos);
800             }
801         }
802         // Drop down to linear search for the last few bytes.
803         // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
804         // and divides length by 2.
805         do {
806             if(inByte==(bytes_[pos++]&0xff)) {
807                 Result result;
808                 int node=bytes_[pos]&0xff;
809                 assert(node>=kMinValueLead);
810                 if((node&kValueIsFinal)!=0) {
811                     // Leave the final value for getValue() to read.
812                     result=Result.FINAL_VALUE;
813                 } else {
814                     // Use the non-final value as the jump delta.
815                     ++pos;
816                     // int delta=readValue(pos, node>>1);
817                     node>>=1;
818                     int delta;
819                     if(node<kMinTwoByteValueLead) {
820                         delta=node-kMinOneByteValueLead;
821                     } else if(node<kMinThreeByteValueLead) {
822                         delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff);
823                     } else if(node<kFourByteValueLead) {
824                         delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff);
825                         pos+=2;
826                     } else if(node==kFourByteValueLead) {
827                         delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff);
828                         pos+=3;
829                     } else {
830                         delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff);
831                         pos+=4;
832                     }
833                     // end readValue()
834                     pos+=delta;
835                     node=bytes_[pos]&0xff;
836                     result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
837                 }
838                 pos_=pos;
839                 return result;
840             }
841             --length;
842             pos=skipValue(bytes_, pos);
843         } while(length>1);
844         if(inByte==(bytes_[pos++]&0xff)) {
845             pos_=pos;
846             int node=bytes_[pos]&0xff;
847             return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
848         } else {
849             stop();
850             return Result.NO_MATCH;
851         }
852     }
853 
854     // Requires remainingLength_<0.
nextImpl(int pos, int inByte)855     private Result nextImpl(int pos, int inByte) {
856         for(;;) {
857             int node=bytes_[pos++]&0xff;
858             if(node<kMinLinearMatch) {
859                 return branchNext(pos, node, inByte);
860             } else if(node<kMinValueLead) {
861                 // Match the first of length+1 bytes.
862                 int length=node-kMinLinearMatch;  // Actual match length minus 1.
863                 if(inByte==(bytes_[pos++]&0xff)) {
864                     remainingMatchLength_=--length;
865                     pos_=pos;
866                     return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
867                             valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
868                 } else {
869                     // No match.
870                     break;
871                 }
872             } else if((node&kValueIsFinal)!=0) {
873                 // No further matching bytes.
874                 break;
875             } else {
876                 // Skip intermediate value.
877                 pos=skipValue(pos, node);
878                 // The next node must not also be a value node.
879                 assert((bytes_[pos]&0xff)<kMinValueLead);
880             }
881         }
882         stop();
883         return Result.NO_MATCH;
884     }
885 
886     // Helper functions for getUniqueValue().
887     // Recursively finds a unique value (or whether there is not a unique one)
888     // from a branch.
889     // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
890     // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
891     private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length,
892                                                   long uniqueValue) {
893         while(length>kMaxBranchLinearSubNodeLength) {
894             ++pos;  // ignore the comparison byte
895             uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue);
896             if(uniqueValue==0) {
897                 return 0;
898             }
899             length=length-(length>>1);
900             pos=skipDelta(bytes, pos);
901         }
902         do {
903             ++pos;  // ignore a comparison byte
904             // handle its value
905             int node=bytes[pos++]&0xff;
906             boolean isFinal=(node&kValueIsFinal)!=0;
907             int value=readValue(bytes, pos, node>>1);
908             pos=skipValue(pos, node);
909             if(isFinal) {
910                 if(uniqueValue!=0) {
911                     if(value!=(int)(uniqueValue>>1)) {
912                         return 0;
913                     }
914                 } else {
915                     uniqueValue=((long)value<<1)|1;
916                 }
917             } else {
918                 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue);
919                 if(uniqueValue==0) {
920                     return 0;
921                 }
922             }
923         } while(--length>1);
924         // ignore the last comparison byte
925         return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
926     }
927     // Recursively finds a unique value (or whether there is not a unique one)
928     // starting from a position on a node lead byte.
929     // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
930     // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
findUniqueValue(byte[] bytes, int pos, long uniqueValue)931     private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) {
932         for(;;) {
933             int node=bytes[pos++]&0xff;
934             if(node<kMinLinearMatch) {
935                 if(node==0) {
936                     node=bytes[pos++]&0xff;
937                 }
938                 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue);
939                 if(uniqueValue==0) {
940                     return 0;
941                 }
942                 pos=(int)(uniqueValue>>>33);
943             } else if(node<kMinValueLead) {
944                 // linear-match node
945                 pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
946             } else {
947                 boolean isFinal=(node&kValueIsFinal)!=0;
948                 int value=readValue(bytes, pos, node>>1);
949                 if(uniqueValue!=0) {
950                     if(value!=(int)(uniqueValue>>1)) {
951                         return 0;
952                     }
953                 } else {
954                     uniqueValue=((long)value<<1)|1;
955                 }
956                 if(isFinal) {
957                     return uniqueValue;
958                 }
959                 pos=skipValue(pos, node);
960             }
961         }
962     }
963 
964     // Helper functions for getNextBytes().
965     // getNextBytes() when pos is on a branch node.
getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out)966     private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) {
967         while(length>kMaxBranchLinearSubNodeLength) {
968             ++pos;  // ignore the comparison byte
969             getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out);
970             length=length-(length>>1);
971             pos=skipDelta(bytes, pos);
972         }
973         do {
974             append(out, bytes[pos++]&0xff);
975             pos=skipValue(bytes, pos);
976         } while(--length>1);
977         append(out, bytes[pos]&0xff);
978     }
append(Appendable out, int c)979     private static void append(Appendable out, int c) {
980         try {
981             out.append((char)c);
982         } catch(IOException e) {
983             throw new ICUUncheckedIOException(e);
984         }
985     }
986 
987     // BytesTrie data structure
988     //
989     // The trie consists of a series of byte-serialized nodes for incremental
990     // string/byte sequence matching. The root node is at the beginning of the trie data.
991     //
992     // Types of nodes are distinguished by their node lead byte ranges.
993     // After each node, except a final-value node, another node follows to
994     // encode match values or continue matching further bytes.
995     //
996     // Node types:
997     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
998     //    The value is for the string/byte sequence so far.
999     //    One node bit indicates whether the value is final or whether
1000     //    matching continues with the next node.
1001     //  - Linear-match node: Matches a number of bytes.
1002     //  - Branch node: Branches to other nodes according to the current input byte.
1003     //    The node byte is the length of the branch (number of bytes to select from)
1004     //    minus 1. It is followed by a sub-node:
1005     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
1006     //      there are length-1 (key, value) pairs and then one more comparison byte.
1007     //      If one of the key bytes matches, then the value is either a final value for
1008     //      the string/byte sequence so far, or a "jump" delta to the next node.
1009     //      If the last byte matches, then matching continues with the next node.
1010     //      (Values have the same encoding as value nodes.)
1011     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
1012     //      there is one byte and one "jump" delta.
1013     //      If the input byte is less than the sub-node byte, then "jump" by delta to
1014     //      the next sub-node which will have a length of length/2.
1015     //      (The delta has its own compact encoding.)
1016     //      Otherwise, skip the "jump" delta to the next sub-node
1017     //      which will have a length of length-length/2.
1018 
1019     // Node lead byte values.
1020 
1021     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
1022     // the length is one more than the next byte.
1023 
1024     // For a branch sub-node with at most this many entries, we drop down
1025     // to a linear search.
1026     /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
1027 
1028     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
1029     /*package*/ static final int kMinLinearMatch=0x10;
1030     /*package*/ static final int kMaxLinearMatchLength=0x10;
1031 
1032     // 20..ff: Variable-length value node.
1033     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
1034     // Then shift-right by 1 bit.
1035     // The remaining lead byte value indicates the number of following bytes (0..4)
1036     // and contains the value's top bits.
1037     /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
1038     // It is a final value if bit 0 is set.
1039     private static final int kValueIsFinal=1;
1040 
1041     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
1042     /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2;  // 0x10
1043     /*package*/ static final int kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
1044 
1045     /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
1046     /*package*/ static final int kMaxTwoByteValue=0x1aff;
1047 
1048     /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
1049     /*package*/ static final int kFourByteValueLead=0x7e;
1050 
1051     // A little more than Unicode code points. (0x11ffff)
1052     /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
1053 
1054     /*package*/ static final int kFiveByteValueLead=0x7f;
1055 
1056     // Compact delta integers.
1057     /*package*/ static final int kMaxOneByteDelta=0xbf;
1058     /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
1059     /*package*/ static final int kMinThreeByteDeltaLead=0xf0;
1060     /*package*/ static final int kFourByteDeltaLead=0xfe;
1061     /*package*/ static final int kFiveByteDeltaLead=0xff;
1062 
1063     /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
1064     /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
1065 
1066     // Fixed value referencing the BytesTrie bytes.
1067     private byte[] bytes_;
1068     private int root_;
1069 
1070     // Iterator variables.
1071 
1072     // Index of next trie byte to read. Negative if no more matches.
1073     private int pos_;
1074     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1075     private int remainingMatchLength_;
1076 };
1077