• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 *   Copyright (C) 2011-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 *   created on: 2011jan06
10 *   created by: Markus W. Scherer
11 *   ported from ICU4C ucharstrie.h/.cpp
12 */
13 
14 package ohos.global.icu.util;
15 
16 import java.io.IOException;
17 import java.util.ArrayList;
18 import java.util.NoSuchElementException;
19 
20 import ohos.global.icu.text.UTF16;
21 import ohos.global.icu.util.BytesTrie.Result;
22 
23 /**
24  * Light-weight, non-const reader class for a CharsTrie.
25  * Traverses a char-serialized data structure with minimal state,
26  * for mapping strings (16-bit-unit sequences) to non-negative integer values.
27  *
28  * <p>This class is not intended for public subclassing.
29  *
30  * @author Markus W. Scherer
31  * @hide exposed on OHOS
32  */
33 public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
34     /**
35      * Constructs a CharsTrie reader instance.
36      *
37      * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
38      * with the offset indicating the first char of that sequence.
39      * The CharsTrie object will not read more chars than
40      * the CharsTrieBuilder generated in the corresponding build() call.
41      *
42      * <p>The CharSequence is not copied/cloned and must not be modified while
43      * the CharsTrie object is in use.
44      *
45      * @param trieChars CharSequence that contains the serialized trie.
46      * @param offset Root offset of the trie in the CharSequence.
47      */
CharsTrie(CharSequence trieChars, int offset)48     public CharsTrie(CharSequence trieChars, int offset) {
49         chars_=trieChars;
50         pos_=root_=offset;
51         remainingMatchLength_=-1;
52     }
53 
54     /**
55      * Copy constructor.
56      * Makes a shallow copy of the other trie reader object and its state.
57      * Does not copy the char array which will be shared.
58      * Same as clone() but without the throws clause.
59      */
CharsTrie(CharsTrie other)60     public CharsTrie(CharsTrie other) {
61         chars_ = other.chars_;
62         root_ = other.root_;
63         pos_ = other.pos_;
64         remainingMatchLength_ = other.remainingMatchLength_;
65     }
66 
67     /**
68      * Clones this trie reader object and its state,
69      * but not the char array which will be shared.
70      * @return A shallow clone of this trie.
71      */
72     @Override
clone()73     public CharsTrie clone() throws CloneNotSupportedException {
74         return (CharsTrie) super.clone();  // A shallow copy is just what we need.
75     }
76 
77     /**
78      * Resets this trie to its initial state.
79      * @return this
80      */
reset()81     public CharsTrie reset() {
82         pos_=root_;
83         remainingMatchLength_=-1;
84         return this;
85     }
86 
87     /**
88      * Returns the state of this trie as a 64-bit integer.
89      * The state value is never 0.
90      *
91      * @return opaque state value
92      * @see #resetToState64
93      */
getState64()94     public long getState64() {
95         return ((long)remainingMatchLength_ << 32) | pos_;
96     }
97 
98     /**
99      * Resets this trie to the saved state.
100      * Unlike {@link #resetToState(State)}, the 64-bit state value
101      * must be from {@link #getState64()} from the same trie object or
102      * from one initialized the exact same way.
103      * Because of no validation, this method is faster.
104      *
105      * @param state The opaque trie state value from getState64().
106      * @return this
107      * @see #getState64
108      * @see #resetToState
109      * @see #reset
110      */
resetToState64(long state)111     public CharsTrie resetToState64(long state) {
112         remainingMatchLength_ = (int)(state >> 32);
113         pos_ = (int)state;
114         return this;
115     }
116 
117     /**
118      * CharsTrie state object, for saving a trie's current state
119      * and resetting the trie back to this state later.
120      * @hide exposed on OHOS
121      */
122     public static final class State {
123         /**
124          * Constructs an empty State.
125          */
State()126         public State() {}
127         private CharSequence chars;
128         private int root;
129         private int pos;
130         private int remainingMatchLength;
131     }
132 
133     /**
134      * Saves the state of this trie.
135      * @param state The State object to hold the trie's state.
136      * @return this
137      * @see #resetToState
138      */
saveState(State state)139     public CharsTrie saveState(State state) /*const*/ {
140         state.chars=chars_;
141         state.root=root_;
142         state.pos=pos_;
143         state.remainingMatchLength=remainingMatchLength_;
144         return this;
145     }
146 
147     /**
148      * Resets this trie to the saved state.
149      * Slower than {@link #resetToState64(long)} which does not validate the state value.
150      *
151      * @param state The State object which holds a saved trie state.
152      * @return this
153      * @throws IllegalArgumentException if the state object contains no state,
154      *         or the state of a different trie
155      * @see #saveState
156      * @see #reset
157      */
resetToState(State state)158     public CharsTrie resetToState(State state) {
159         if(chars_==state.chars && chars_!=null && root_==state.root) {
160             pos_=state.pos;
161             remainingMatchLength_=state.remainingMatchLength;
162         } else {
163             throw new IllegalArgumentException("incompatible trie state");
164         }
165         return this;
166     }
167 
168     /**
169      * Determines whether the string so far matches, whether it has a value,
170      * and whether another input char can continue a matching string.
171      * @return The match/value Result.
172      */
current()173     public Result current() /*const*/ {
174         int pos=pos_;
175         if(pos<0) {
176             return Result.NO_MATCH;
177         } else {
178             int node;
179             return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
180                     valueResults_[node>>15] : Result.NO_VALUE;
181         }
182     }
183 
184     /**
185      * Traverses the trie from the initial state for this input char.
186      * Equivalent to reset().next(inUnit).
187      * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
188      * @return The match/value Result.
189      */
first(int inUnit)190     public Result first(int inUnit) {
191         remainingMatchLength_=-1;
192         return nextImpl(root_, inUnit);
193     }
194 
195     /**
196      * Traverses the trie from the initial state for the
197      * one or two UTF-16 code units for this input code point.
198      * Equivalent to reset().nextForCodePoint(cp).
199      * @param cp A Unicode code point 0..0x10ffff.
200      * @return The match/value Result.
201      */
firstForCodePoint(int cp)202     public Result firstForCodePoint(int cp) {
203         return cp<=0xffff ?
204             first(cp) :
205             (first(UTF16.getLeadSurrogate(cp)).hasNext() ?
206                 next(UTF16.getTrailSurrogate(cp)) :
207                 Result.NO_MATCH);
208     }
209 
210     /**
211      * Traverses the trie from the current state for this input char.
212      * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
213      * @return The match/value Result.
214      */
next(int inUnit)215     public Result next(int inUnit) {
216         int pos=pos_;
217         if(pos<0) {
218             return Result.NO_MATCH;
219         }
220         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
221         if(length>=0) {
222             // Remaining part of a linear-match node.
223             if(inUnit==chars_.charAt(pos++)) {
224                 remainingMatchLength_=--length;
225                 pos_=pos;
226                 int node;
227                 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
228                         valueResults_[node>>15] : Result.NO_VALUE;
229             } else {
230                 stop();
231                 return Result.NO_MATCH;
232             }
233         }
234         return nextImpl(pos, inUnit);
235     }
236 
237     /**
238      * Traverses the trie from the current state for the
239      * one or two UTF-16 code units for this input code point.
240      * @param cp A Unicode code point 0..0x10ffff.
241      * @return The match/value Result.
242      */
nextForCodePoint(int cp)243     public Result nextForCodePoint(int cp) {
244         return cp<=0xffff ?
245             next(cp) :
246             (next(UTF16.getLeadSurrogate(cp)).hasNext() ?
247                 next(UTF16.getTrailSurrogate(cp)) :
248                 Result.NO_MATCH);
249     }
250 
251     /**
252      * Traverses the trie from the current state for this string.
253      * Equivalent to
254      * <pre>
255      * Result result=current();
256      * for(each c in s)
257      *   if(!result.hasNext()) return Result.NO_MATCH;
258      *   result=next(c);
259      * return result;
260      * </pre>
261      * @param s Contains a string.
262      * @param sIndex The start index of the string in s.
263      * @param sLimit The (exclusive) end index of the string in s.
264      * @return The match/value Result.
265      */
next(CharSequence s, int sIndex, int sLimit)266     public Result next(CharSequence s, int sIndex, int sLimit) {
267         if(sIndex>=sLimit) {
268             // Empty input.
269             return current();
270         }
271         int pos=pos_;
272         if(pos<0) {
273             return Result.NO_MATCH;
274         }
275         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
276         for(;;) {
277             // Fetch the next input unit, if there is one.
278             // Continue a linear-match node.
279             char inUnit;
280             for(;;) {
281                 if(sIndex==sLimit) {
282                     remainingMatchLength_=length;
283                     pos_=pos;
284                     int node;
285                     return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
286                             valueResults_[node>>15] : Result.NO_VALUE;
287                 }
288                 inUnit=s.charAt(sIndex++);
289                 if(length<0) {
290                     remainingMatchLength_=length;
291                     break;
292                 }
293                 if(inUnit!=chars_.charAt(pos)) {
294                     stop();
295                     return Result.NO_MATCH;
296                 }
297                 ++pos;
298                 --length;
299             }
300             int node=chars_.charAt(pos++);
301             for(;;) {
302                 if(node<kMinLinearMatch) {
303                     Result result=branchNext(pos, node, inUnit);
304                     if(result==Result.NO_MATCH) {
305                         return Result.NO_MATCH;
306                     }
307                     // Fetch the next input unit, if there is one.
308                     if(sIndex==sLimit) {
309                         return result;
310                     }
311                     if(result==Result.FINAL_VALUE) {
312                         // No further matching units.
313                         stop();
314                         return Result.NO_MATCH;
315                     }
316                     inUnit=s.charAt(sIndex++);
317                     pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
318                     node=chars_.charAt(pos++);
319                 } else if(node<kMinValueLead) {
320                     // Match length+1 units.
321                     length=node-kMinLinearMatch;  // Actual match length minus 1.
322                     if(inUnit!=chars_.charAt(pos)) {
323                         stop();
324                         return Result.NO_MATCH;
325                     }
326                     ++pos;
327                     --length;
328                     break;
329                 } else if((node&kValueIsFinal)!=0) {
330                     // No further matching units.
331                     stop();
332                     return Result.NO_MATCH;
333                 } else {
334                     // Skip intermediate value.
335                     pos=skipNodeValue(pos, node);
336                     node&=kNodeTypeMask;
337                 }
338             }
339         }
340     }
341 
342     /**
343      * Returns a matching string's value if called immediately after
344      * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
345      * getValue() can be called multiple times.
346      *
347      * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
348      * @return The value for the string so far.
349      */
getValue()350     public int getValue() /*const*/ {
351         int pos=pos_;
352         int leadUnit=chars_.charAt(pos++);
353         assert(leadUnit>=kMinValueLead);
354         return (leadUnit&kValueIsFinal)!=0 ?
355             readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
356     }
357 
358     /**
359      * Determines whether all strings reachable from the current state
360      * map to the same value, and if so, returns that value.
361      * @return The unique value in bits 32..1 with bit 0 set,
362      *         if all strings reachable from the current state
363      *         map to the same value; otherwise returns 0.
364      */
getUniqueValue()365     public long getUniqueValue() /*const*/ {
366         int pos=pos_;
367         if(pos<0) {
368             return 0;
369         }
370         // Skip the rest of a pending linear-match node.
371         long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0);
372         // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
373         return (uniqueValue<<31)>>31;
374     }
375 
376     /**
377      * Finds each char which continues the string from the current state.
378      * That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
379      * @param out Each next char is appended to this object.
380      *            (Only uses the out.append(c) method.)
381      * @return The number of chars which continue the string from here.
382      */
getNextChars(Appendable out)383     public int getNextChars(Appendable out) /*const*/ {
384         int pos=pos_;
385         if(pos<0) {
386             return 0;
387         }
388         if(remainingMatchLength_>=0) {
389             append(out, chars_.charAt(pos));  // Next unit of a pending linear-match node.
390             return 1;
391         }
392         int node=chars_.charAt(pos++);
393         if(node>=kMinValueLead) {
394             if((node&kValueIsFinal)!=0) {
395                 return 0;
396             } else {
397                 pos=skipNodeValue(pos, node);
398                 node&=kNodeTypeMask;
399             }
400         }
401         if(node<kMinLinearMatch) {
402             if(node==0) {
403                 node=chars_.charAt(pos++);
404             }
405             getNextBranchChars(chars_, pos, ++node, out);
406             return node;
407         } else {
408             // First unit of the linear-match node.
409             append(out, chars_.charAt(pos));
410             return 1;
411         }
412     }
413 
414     /**
415      * Iterates from the current state of this trie.
416      * @return A new CharsTrie.Iterator.
417      */
418     @Override
iterator()419     public Iterator iterator() {
420         return new Iterator(chars_, pos_, remainingMatchLength_, 0);
421     }
422 
423     /**
424      * Iterates from the current state of this trie.
425      * @param maxStringLength If 0, the iterator returns full strings.
426      *                        Otherwise, the iterator returns strings with this maximum length.
427      * @return A new CharsTrie.Iterator.
428      */
iterator(int maxStringLength)429     public Iterator iterator(int maxStringLength) {
430         return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength);
431     }
432 
433     /**
434      * Iterates from the root of a char-serialized BytesTrie.
435      * @param trieChars CharSequence that contains the serialized trie.
436      * @param offset Root offset of the trie in the CharSequence.
437      * @param maxStringLength If 0, the iterator returns full strings.
438      *                        Otherwise, the iterator returns strings with this maximum length.
439      * @return A new CharsTrie.Iterator.
440      */
iterator(CharSequence trieChars, int offset, int maxStringLength)441     public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
442         return new Iterator(trieChars, offset, -1, maxStringLength);
443     }
444 
445     /**
446      * Return value type for the Iterator.
447      * @hide exposed on OHOS
448      */
449     public static final class Entry {
450         /**
451          * The string.
452          */
453         public CharSequence chars;
454         /**
455          * The value associated with the string.
456          */
457         public int value;
458 
Entry()459         private Entry() {
460         }
461     }
462 
463     /**
464      * Iterator for all of the (string, value) pairs in a CharsTrie.
465      * @hide exposed on OHOS
466      */
467     public static final class Iterator implements java.util.Iterator<Entry> {
Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength)468         private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
469             chars_=trieChars;
470             pos_=initialPos_=offset;
471             remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
472             maxLength_=maxStringLength;
473             int length=remainingMatchLength_;  // Actual remaining match length minus 1.
474             if(length>=0) {
475                 // Pending linear-match node, append remaining bytes to str_.
476                 ++length;
477                 if(maxLength_>0 && length>maxLength_) {
478                     length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
479                 }
480                 str_.append(chars_, pos_, pos_+length);
481                 pos_+=length;
482                 remainingMatchLength_-=length;
483             }
484         }
485 
486         /**
487          * Resets this iterator to its initial state.
488          * @return this
489          */
reset()490         public Iterator reset() {
491             pos_=initialPos_;
492             remainingMatchLength_=initialRemainingMatchLength_;
493             skipValue_=false;
494             int length=remainingMatchLength_+1;  // Remaining match length.
495             if(maxLength_>0 && length>maxLength_) {
496                 length=maxLength_;
497             }
498             str_.setLength(length);
499             pos_+=length;
500             remainingMatchLength_-=length;
501             stack_.clear();
502             return this;
503         }
504 
505         /**
506          * @return true if there are more elements.
507          */
508         @Override
hasNext()509         public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
510 
511         /**
512          * Finds the next (string, value) pair if there is one.
513          *
514          * If the string is truncated to the maximum length and does not
515          * have a real value, then the value is set to -1.
516          * In this case, this "not a real value" is indistinguishable from
517          * a real value of -1.
518          * @return An Entry with the string and value of the next element.
519          * @throws NoSuchElementException - iteration has no more elements.
520          */
521         @Override
next()522         public Entry next() {
523             int pos=pos_;
524             if(pos<0) {
525                 if(stack_.isEmpty()) {
526                     throw new NoSuchElementException();
527                 }
528                 // Pop the state off the stack and continue with the next outbound edge of
529                 // the branch node.
530                 long top=stack_.remove(stack_.size()-1);
531                 int length=(int)top;
532                 pos=(int)(top>>32);
533                 str_.setLength(length&0xffff);
534                 length>>>=16;
535                 if(length>1) {
536                     pos=branchNext(pos, length);
537                     if(pos<0) {
538                         return entry_;  // Reached a final value.
539                     }
540                 } else {
541                     str_.append(chars_.charAt(pos++));
542                 }
543             }
544             if(remainingMatchLength_>=0) {
545                 // We only get here if we started in a pending linear-match node
546                 // with more than maxLength remaining units.
547                 return truncateAndStop();
548             }
549             for(;;) {
550                 int node=chars_.charAt(pos++);
551                 if(node>=kMinValueLead) {
552                     if(skipValue_) {
553                         pos=skipNodeValue(pos, node);
554                         node&=kNodeTypeMask;
555                         skipValue_=false;
556                     } else {
557                         // Deliver value for the string so far.
558                         boolean isFinal=(node&kValueIsFinal)!=0;
559                         if(isFinal) {
560                             entry_.value=readValue(chars_, pos, node&0x7fff);
561                         } else {
562                             entry_.value=readNodeValue(chars_, pos, node);
563                         }
564                         if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
565                             pos_=-1;
566                         } else {
567                             // We cannot skip the value right here because it shares its
568                             // lead unit with a match node which we have to evaluate
569                             // next time.
570                             // Instead, keep pos_ on the node lead unit itself.
571                             pos_=pos-1;
572                             skipValue_=true;
573                         }
574                         entry_.chars=str_;
575                         return entry_;
576                     }
577                 }
578                 if(maxLength_>0 && str_.length()==maxLength_) {
579                     return truncateAndStop();
580                 }
581                 if(node<kMinLinearMatch) {
582                     if(node==0) {
583                         node=chars_.charAt(pos++);
584                     }
585                     pos=branchNext(pos, node+1);
586                     if(pos<0) {
587                         return entry_;  // Reached a final value.
588                     }
589                 } else {
590                     // Linear-match node, append length units to str_.
591                     int length=node-kMinLinearMatch+1;
592                     if(maxLength_>0 && str_.length()+length>maxLength_) {
593                         str_.append(chars_, pos, pos+maxLength_-str_.length());
594                         return truncateAndStop();
595                     }
596                     str_.append(chars_, pos, pos+length);
597                     pos+=length;
598                 }
599             }
600         }
601 
602         /**
603          * Iterator.remove() is not supported.
604          * @throws UnsupportedOperationException (always)
605          */
606         @Override
remove()607         public void remove() {
608             throw new UnsupportedOperationException();
609         }
610 
truncateAndStop()611         private Entry truncateAndStop() {
612             pos_=-1;
613             // We reset entry_.chars every time we return entry_
614             // just because the caller might have modified the Entry.
615             entry_.chars=str_;
616             entry_.value=-1;  // no real value for str
617             return entry_;
618         }
619 
branchNext(int pos, int length)620         private int branchNext(int pos, int length) {
621             while(length>kMaxBranchLinearSubNodeLength) {
622                 ++pos;  // ignore the comparison unit
623                 // Push state for the greater-or-equal edge.
624                 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length());
625                 // Follow the less-than edge.
626                 length>>=1;
627                 pos=jumpByDelta(chars_, pos);
628             }
629             // List of key-value pairs where values are either final values or jump deltas.
630             // Read the first (key, value) pair.
631             char trieUnit=chars_.charAt(pos++);
632             int node=chars_.charAt(pos++);
633             boolean isFinal=(node&kValueIsFinal)!=0;
634             int value=readValue(chars_, pos, node&=0x7fff);
635             pos=skipValue(pos, node);
636             stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length());
637             str_.append(trieUnit);
638             if(isFinal) {
639                 pos_=-1;
640                 entry_.chars=str_;
641                 entry_.value=value;
642                 return -1;
643             } else {
644                 return pos+value;
645             }
646         }
647 
648         private CharSequence chars_;
649         private int pos_;
650         private int initialPos_;
651         private int remainingMatchLength_;
652         private int initialRemainingMatchLength_;
653         private boolean skipValue_;  // Skip intermediate value which was already delivered.
654 
655         private StringBuilder str_=new StringBuilder();
656         private int maxLength_;
657         private Entry entry_=new Entry();
658 
659         // The stack stores longs for backtracking to another
660         // outbound edge of a branch node.
661         // Each long has the offset in chars_ in bits 62..32,
662         // the str_.length() from before the node in bits 15..0,
663         // and the remaining branch length in bits 31..16.
664         // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
665         // but the code looks more confusing that way.)
666         private ArrayList<Long> stack_=new ArrayList<>();
667     }
668 
stop()669     private void stop() {
670         pos_=-1;
671     }
672 
673     // Reads a compact 32-bit integer.
674     // pos is already after the leadUnit, and the lead unit has bit 15 reset.
readValue(CharSequence chars, int pos, int leadUnit)675     private static int readValue(CharSequence chars, int pos, int leadUnit) {
676         int value;
677         if(leadUnit<kMinTwoUnitValueLead) {
678             value=leadUnit;
679         } else if(leadUnit<kThreeUnitValueLead) {
680             value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos);
681         } else {
682             value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
683         }
684         return value;
685     }
skipValue(int pos, int leadUnit)686     private static int skipValue(int pos, int leadUnit) {
687         if(leadUnit>=kMinTwoUnitValueLead) {
688             if(leadUnit<kThreeUnitValueLead) {
689                 ++pos;
690             } else {
691                 pos+=2;
692             }
693         }
694         return pos;
695     }
skipValue(CharSequence chars, int pos)696     private static int skipValue(CharSequence chars, int pos) {
697         int leadUnit=chars.charAt(pos++);
698         return skipValue(pos, leadUnit&0x7fff);
699     }
700 
readNodeValue(CharSequence chars, int pos, int leadUnit)701     private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
702         assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
703         int value;
704         if(leadUnit<kMinTwoUnitNodeValueLead) {
705             value=(leadUnit>>6)-1;
706         } else if(leadUnit<kThreeUnitNodeValueLead) {
707             value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos);
708         } else {
709             value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
710         }
711         return value;
712     }
skipNodeValue(int pos, int leadUnit)713     private static int skipNodeValue(int pos, int leadUnit) {
714         assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
715         if(leadUnit>=kMinTwoUnitNodeValueLead) {
716             if(leadUnit<kThreeUnitNodeValueLead) {
717                 ++pos;
718             } else {
719                 pos+=2;
720             }
721         }
722         return pos;
723     }
724 
jumpByDelta(CharSequence chars, int pos)725     private static int jumpByDelta(CharSequence chars, int pos) {
726         int delta=chars.charAt(pos++);
727         if(delta>=kMinTwoUnitDeltaLead) {
728             if(delta==kThreeUnitDeltaLead) {
729                 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
730                 pos+=2;
731             } else {
732                 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
733             }
734         }
735         return pos+delta;
736     }
737 
skipDelta(CharSequence chars, int pos)738     private static int skipDelta(CharSequence chars, int pos) {
739         int delta=chars.charAt(pos++);
740         if(delta>=kMinTwoUnitDeltaLead) {
741             if(delta==kThreeUnitDeltaLead) {
742                 pos+=2;
743             } else {
744                 ++pos;
745             }
746         }
747         return pos;
748     }
749 
750     private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
751 
752     // Handles a branch node for both next(unit) and next(string).
branchNext(int pos, int length, int inUnit)753     private Result branchNext(int pos, int length, int inUnit) {
754         // Branch according to the current unit.
755         if(length==0) {
756             length=chars_.charAt(pos++);
757         }
758         ++length;
759         // The length of the branch is the number of units to select from.
760         // The data structure encodes a binary search.
761         while(length>kMaxBranchLinearSubNodeLength) {
762             if(inUnit<chars_.charAt(pos++)) {
763                 length>>=1;
764                 pos=jumpByDelta(chars_, pos);
765             } else {
766                 length=length-(length>>1);
767                 pos=skipDelta(chars_, pos);
768             }
769         }
770         // Drop down to linear search for the last few units.
771         // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
772         // and divides length by 2.
773         do {
774             if(inUnit==chars_.charAt(pos++)) {
775                 Result result;
776                 int node=chars_.charAt(pos);
777                 if((node&kValueIsFinal)!=0) {
778                     // Leave the final value for getValue() to read.
779                     result=Result.FINAL_VALUE;
780                 } else {
781                     // Use the non-final value as the jump delta.
782                     ++pos;
783                     // int delta=readValue(pos, node);
784                     int delta;
785                     if(node<kMinTwoUnitValueLead) {
786                         delta=node;
787                     } else if(node<kThreeUnitValueLead) {
788                         delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
789                     } else {
790                         delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
791                         pos+=2;
792                     }
793                     // end readValue()
794                     pos+=delta;
795                     node=chars_.charAt(pos);
796                     result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
797                 }
798                 pos_=pos;
799                 return result;
800             }
801             --length;
802             pos=skipValue(chars_, pos);
803         } while(length>1);
804         if(inUnit==chars_.charAt(pos++)) {
805             pos_=pos;
806             int node=chars_.charAt(pos);
807             return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
808         } else {
809             stop();
810             return Result.NO_MATCH;
811         }
812     }
813 
814     // Requires remainingLength_<0.
nextImpl(int pos, int inUnit)815     private Result nextImpl(int pos, int inUnit) {
816         int node=chars_.charAt(pos++);
817         for(;;) {
818             if(node<kMinLinearMatch) {
819                 return branchNext(pos, node, inUnit);
820             } else if(node<kMinValueLead) {
821                 // Match the first of length+1 units.
822                 int length=node-kMinLinearMatch;  // Actual match length minus 1.
823                 if(inUnit==chars_.charAt(pos++)) {
824                     remainingMatchLength_=--length;
825                     pos_=pos;
826                     return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
827                             valueResults_[node>>15] : Result.NO_VALUE;
828                 } else {
829                     // No match.
830                     break;
831                 }
832             } else if((node&kValueIsFinal)!=0) {
833                 // No further matching units.
834                 break;
835             } else {
836                 // Skip intermediate value.
837                 pos=skipNodeValue(pos, node);
838                 node&=kNodeTypeMask;
839             }
840         }
841         stop();
842         return Result.NO_MATCH;
843     }
844 
845     // Helper functions for getUniqueValue().
846     // Recursively finds a unique value (or whether there is not a unique one)
847     // from a branch.
848     // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
849     // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
findUniqueValueFromBranch(CharSequence chars, int pos, int length, long uniqueValue)850     private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
851                                                   long uniqueValue) {
852         while(length>kMaxBranchLinearSubNodeLength) {
853             ++pos;  // ignore the comparison unit
854             uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
855             if(uniqueValue==0) {
856                 return 0;
857             }
858             length=length-(length>>1);
859             pos=skipDelta(chars, pos);
860         }
861         do {
862             ++pos;  // ignore a comparison unit
863             // handle its value
864             int node=chars.charAt(pos++);
865             boolean isFinal=(node&kValueIsFinal)!=0;
866             node&=0x7fff;
867             int value=readValue(chars, pos, node);
868             pos=skipValue(pos, node);
869             if(isFinal) {
870                 if(uniqueValue!=0) {
871                     if(value!=(int)(uniqueValue>>1)) {
872                         return 0;
873                     }
874                 } else {
875                     uniqueValue=((long)value<<1)|1;
876                 }
877             } else {
878                 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
879                 if(uniqueValue==0) {
880                     return 0;
881                 }
882             }
883         } while(--length>1);
884         // ignore the last comparison byte
885         return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
886     }
887     // Recursively finds a unique value (or whether there is not a unique one)
888     // starting from a position on a node lead unit.
889     // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
890     // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
findUniqueValue(CharSequence chars, int pos, long uniqueValue)891     private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
892         int node=chars.charAt(pos++);
893         for(;;) {
894             if(node<kMinLinearMatch) {
895                 if(node==0) {
896                     node=chars.charAt(pos++);
897                 }
898                 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
899                 if(uniqueValue==0) {
900                     return 0;
901                 }
902                 pos=(int)(uniqueValue>>>33);
903                 node=chars.charAt(pos++);
904             } else if(node<kMinValueLead) {
905                 // linear-match node
906                 pos+=node-kMinLinearMatch+1;  // Ignore the match units.
907                 node=chars.charAt(pos++);
908             } else {
909                 boolean isFinal=(node&kValueIsFinal)!=0;
910                 int value;
911                 if(isFinal) {
912                     value=readValue(chars, pos, node&0x7fff);
913                 } else {
914                     value=readNodeValue(chars, pos, node);
915                 }
916                 if(uniqueValue!=0) {
917                     if(value!=(int)(uniqueValue>>1)) {
918                         return 0;
919                     }
920                 } else {
921                     uniqueValue=((long)value<<1)|1;
922                 }
923                 if(isFinal) {
924                     return uniqueValue;
925                 }
926                 pos=skipNodeValue(pos, node);
927                 node&=kNodeTypeMask;
928             }
929         }
930     }
931 
932     // Helper functions for getNextChars().
933     // getNextChars() when pos is on a branch node.
getNextBranchChars(CharSequence chars, int pos, int length, Appendable out)934     private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
935         while(length>kMaxBranchLinearSubNodeLength) {
936             ++pos;  // ignore the comparison unit
937             getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out);
938             length=length-(length>>1);
939             pos=skipDelta(chars, pos);
940         }
941         do {
942             append(out, chars.charAt(pos++));
943             pos=skipValue(chars, pos);
944         } while(--length>1);
945         append(out, chars.charAt(pos));
946     }
append(Appendable out, int c)947     private static void append(Appendable out, int c) {
948         try {
949             out.append((char)c);
950         } catch(IOException e) {
951             throw new ICUUncheckedIOException(e);
952         }
953     }
954 
955     // CharsTrie data structure
956     //
957     // The trie consists of a series of char-serialized nodes for incremental
958     // Unicode string/char sequence matching. (char=16-bit unsigned integer)
959     // The root node is at the beginning of the trie data.
960     //
961     // Types of nodes are distinguished by their node lead unit ranges.
962     // After each node, except a final-value node, another node follows to
963     // encode match values or continue matching further units.
964     //
965     // Node types:
966     //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
967     //    The value is for the string/char sequence so far.
968     //  - Match node, optionally with an intermediate value in a different compact format.
969     //    The value, if present, is for the string/char sequence so far.
970     //
971     //  Aside from the value, which uses the node lead unit's high bits:
972     //
973     //  - Linear-match node: Matches a number of units.
974     //  - Branch node: Branches to other nodes according to the current input unit.
975     //    The node unit is the length of the branch (number of units to select from)
976     //    minus 1. It is followed by a sub-node:
977     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
978     //      there are length-1 (key, value) pairs and then one more comparison unit.
979     //      If one of the key units matches, then the value is either a final value for
980     //      the string so far, or a "jump" delta to the next node.
981     //      If the last unit matches, then matching continues with the next node.
982     //      (Values have the same encoding as final-value nodes.)
983     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
984     //      there is one unit and one "jump" delta.
985     //      If the input unit is less than the sub-node unit, then "jump" by delta to
986     //      the next sub-node which will have a length of length/2.
987     //      (The delta has its own compact encoding.)
988     //      Otherwise, skip the "jump" delta to the next sub-node
989     //      which will have a length of length-length/2.
990 
991     // Match-node lead unit values, after masking off intermediate-value bits:
992 
993     // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
994     // the length is one more than the next unit.
995 
996     // For a branch sub-node with at most this many entries, we drop down
997     // to a linear search.
998     /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
999 
1000     // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
1001     /*package*/ static final int kMinLinearMatch=0x30;
1002     /*package*/ static final int kMaxLinearMatchLength=0x10;
1003 
1004     // Match-node lead unit bits 14..6 for the optional intermediate value.
1005     // If these bits are 0, then there is no intermediate value.
1006     // Otherwise, see the *NodeValue* constants below.
1007     /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
1008     /*package*/ static final int kNodeTypeMask=kMinValueLead-1;  // 0x003f
1009 
1010     // A final-value node has bit 15 set.
1011     /*package*/ static final int kValueIsFinal=0x8000;
1012 
1013     // Compact value: After testing and masking off bit 15, use the following thresholds.
1014     /*package*/ static final int kMaxOneUnitValue=0x3fff;
1015 
1016     /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
1017     /*package*/ static final int kThreeUnitValueLead=0x7fff;
1018 
1019     /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
1020 
1021     // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
1022     /*package*/ static final int kMaxOneUnitNodeValue=0xff;
1023     /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
1024     /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
1025 
1026     /*package*/ static final int kMaxTwoUnitNodeValue=
1027         ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
1028 
1029     // Compact delta integers.
1030     /*package*/ static final int kMaxOneUnitDelta=0xfbff;
1031     /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
1032     /*package*/ static final int kThreeUnitDeltaLead=0xffff;
1033 
1034     /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
1035 
1036     // Fixed value referencing the CharsTrie words.
1037     private CharSequence chars_;
1038     private int root_;
1039 
1040     // Iterator variables.
1041 
1042     // Pointer to next trie unit to read. NULL if no more matches.
1043     private int pos_;
1044     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1045     private int remainingMatchLength_;
1046 }
1047