• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 */
9 
10 package com.ibm.icu.impl;
11 
12 import java.util.NoSuchElementException;
13 
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.text.UTF16;
16 import com.ibm.icu.util.RangeValueIterator;
17 
18 /**
19  * <p>Class enabling iteration of the values in a Trie.</p>
20  *
21  * <p>2015-sep-03 TODO: Only used in test code, move there.
22  *
23  * <p>Result of each iteration contains the interval of codepoints that have
24  * the same value type and the value type itself.</p>
25  * <p>The comparison of each codepoint value is done via extract(), which the
26  * default implementation is to return the value as it is.</p>
27  * <p>Method extract() can be overwritten to perform manipulations on
28  * codepoint values in order to perform specialized comparison.</p>
29  * <p>TrieIterator is designed to be a generic iterator for the CharTrie
30  * and the IntTrie, hence to accommodate both types of data, the return
31  * result will be in terms of int (32 bit) values.</p>
32  * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
33  * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
34  * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
35  * sense, the caller will have to pass a object with a callback function
36  * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
37  * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
38  * codepoints with the same value as determined by
39  * UTrieEnumValue(const void *context, uint32_t value). for each range,
40  * utrie_enum calls the callback function to perform a task. In this way,
41  * icu4c performs the iteration within utrie_enum.
42  * To follow the JDK model, icu4j is slightly different from icu4c.
43  * Instead of requesting the caller to implement an object for a callback.
44  * The caller will have to implement a subclass of TrieIterator, fleshing out
45  * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
46  * the caller will have to code his own iteration and flesh out the task
47  * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
48  * </p>
49  * <p>There are basically 3 usage scenarios for porting:</p>
50  * <p>1) UTrieEnumValue is the only implemented callback then just implement a
51  * subclass of TrieIterator and override the extract(int) method. The
52  * extract(int) method is analogous to UTrieEnumValue callback.
53  * </p>
54  * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
55  * a subclass of TrieIterator, override the extract method and iterate, e.g
56  * </p>
57  * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
58  *               set);<br>
59  * In Java :<br>
60  * <pre>
61  * class TrieIteratorImpl extends TrieIterator{
62  *     public TrieIteratorImpl(Trie data){
63  *         super(data);
64  *     }
65  *     public int extract(int value){
66  *         // port the implementation of _enumPropertyStartsValue here
67  *     }
68  * }
69  * ....
70  * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
71  * while(fcdIter.next(result)) {
72  *     // port the implementation of _enumPropertyStartsRange
73  * }
74  * </pre>
75  * </p>
76  * <p>3) UTrieEnumRange is the only implemented callback then just implement
77  * the while loop, when utrie_enum is called
78  * <pre>
79  * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
80  * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
81  * while(fcdIter.next(result)){
82  *     set.add(result.start);
83  * }
84  * </p>
85  * @author synwee
86  * @see com.ibm.icu.impl.Trie
87  * @since release 2.1, Jan 17 2002
88  */
89 public class TrieIterator implements RangeValueIterator
90 
91 {
92     // public constructor ---------------------------------------------
93 
94     /**
95     * TrieEnumeration constructor
96     * @param trie to be used
97     * @exception IllegalArgumentException throw when argument is null.
98     */
TrieIterator(Trie trie)99     public TrieIterator(Trie trie)
100     {
101         if (trie == null) {
102             throw new IllegalArgumentException(
103                                           "Argument trie cannot be null");
104         }
105         m_trie_             = trie;
106         // synwee: check that extract belongs to the child class
107         m_initialValue_     = extract(m_trie_.getInitialValue());
108         reset();
109     }
110 
111     // public methods -------------------------------------------------
112 
113     /**
114     * <p>Returns true if we are not at the end of the iteration, false
115     * otherwise.</p>
116     * <p>The next set of codepoints with the same value type will be
117     * calculated during this call and returned in the argument element.</p>
118     * @param element return result
119     * @return true if we are not at the end of the iteration, false otherwise.
120     * @exception NoSuchElementException - if no more elements exist.
121     * @see com.ibm.icu.util.RangeValueIterator.Element
122     */
123     @Override
next(Element element)124     public final boolean next(Element element)
125     {
126         if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
127             return false;
128         }
129         if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
130             calculateNextBMPElement(element)) {
131             return true;
132         }
133         calculateNextSupplementaryElement(element);
134         return true;
135     }
136 
137     /**
138     * Resets the iterator to the beginning of the iteration
139     */
140     @Override
reset()141     public final void reset()
142     {
143         m_currentCodepoint_ = 0;
144         m_nextCodepoint_    = 0;
145         m_nextIndex_        = 0;
146         m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
147         if (m_nextBlock_ == m_trie_.m_dataOffset_) {
148             m_nextValue_ = m_initialValue_;
149         }
150         else {
151             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
152         }
153         m_nextBlockIndex_ = 0;
154         m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
155     }
156 
157     // protected methods ----------------------------------------------
158 
159     /**
160     * Called by next() to extracts a 32 bit value from a trie value
161     * used for comparison.
162     * This method is to be overwritten if special manipulation is to be done
163     * to retrieve a relevant comparison.
164     * The default function is to return the value as it is.
165     * @param value a value from the trie
166     * @return extracted value
167     */
extract(int value)168     protected int extract(int value)
169     {
170         return value;
171     }
172 
173     // private methods ------------------------------------------------
174 
175     /**
176     * Set the result values
177     * @param element return result object
178     * @param start codepoint of range
179     * @param limit (end + 1) codepoint of range
180     * @param value common value of range
181     */
setResult(Element element, int start, int limit, int value)182     private final void setResult(Element element, int start, int limit,
183                                  int value)
184     {
185         element.start = start;
186         element.limit = limit;
187         element.value = value;
188     }
189 
190     /**
191     * Finding the next element.
192     * This method is called just before returning the result of
193     * next().
194     * We always store the next element before it is requested.
195     * In the case that we have to continue calculations into the
196     * supplementary planes, a false will be returned.
197     * @param element return result object
198     * @return true if the next range is found, false if we have to proceed to
199     *         the supplementary range.
200     */
calculateNextBMPElement(Element element)201     private final boolean calculateNextBMPElement(Element element)
202     {
203         int currentValue    = m_nextValue_;
204         m_currentCodepoint_ = m_nextCodepoint_;
205         m_nextCodepoint_ ++;
206         m_nextBlockIndex_ ++;
207         if (!checkBlockDetail(currentValue)) {
208             setResult(element, m_currentCodepoint_, m_nextCodepoint_,
209                       currentValue);
210             return true;
211         }
212         // synwee check that next block index == 0 here
213         // enumerate BMP - the main loop enumerates data blocks
214         while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
215             // because of the way the character is split to form the index
216             // the lead surrogate and trail surrogate can not be in the
217             // mid of a block
218             if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
219                 // skip lead surrogate code units,
220                 // go to lead surrogate codepoints
221                 m_nextIndex_ = BMP_INDEX_LENGTH_;
222             }
223             else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
224                 // go back to regular BMP code points
225                 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
226             } else {
227                 m_nextIndex_ ++;
228             }
229 
230             m_nextBlockIndex_ = 0;
231             if (!checkBlock(currentValue)) {
232                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
233                           currentValue);
234                 return true;
235             }
236         }
237         m_nextCodepoint_ --;   // step one back since this value has not been
238         m_nextBlockIndex_ --;  // retrieved yet.
239         return false;
240     }
241 
242     /**
243     * Finds the next supplementary element.
244     * For each entry in the trie, the value to be delivered is passed through
245     * extract().
246     * We always store the next element before it is requested.
247     * Called after calculateNextBMP() completes its round of BMP characters.
248     * There is a slight difference in the usage of m_currentCodepoint_
249     * here as compared to calculateNextBMP(). Though both represents the
250     * lower bound of the next element, in calculateNextBMP() it gets set
251     * at the start of any loop, where-else, in calculateNextSupplementary()
252     * since m_currentCodepoint_ already contains the lower bound of the
253     * next element (passed down from calculateNextBMP()), we keep it till
254     * the end before resetting it to the new value.
255     * Note, if there are no more iterations, it will never get to here.
256     * Blocked out by next().
257     * @param element return result object
258     */
calculateNextSupplementaryElement(Element element)259     private final void calculateNextSupplementaryElement(Element element)
260     {
261         int currentValue = m_nextValue_;
262         m_nextCodepoint_ ++;
263         m_nextBlockIndex_ ++;
264 
265         if (UTF16.getTrailSurrogate(m_nextCodepoint_)
266                                         != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
267             // this piece is only called when we are in the middle of a lead
268             // surrogate block
269             if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
270                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
271                           currentValue);
272                 m_currentCodepoint_ = m_nextCodepoint_;
273                 return;
274             }
275             // we have cleared one block
276             m_nextIndex_ ++;
277             m_nextTrailIndexOffset_ ++;
278             if (!checkTrailBlock(currentValue)) {
279                 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
280                           currentValue);
281                 m_currentCodepoint_ = m_nextCodepoint_;
282                 return;
283             }
284         }
285         int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
286         // enumerate supplementary code points
287         while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
288             // lead surrogate access
289             final int leadBlock =
290                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
291                                                    Trie.INDEX_STAGE_2_SHIFT_;
292             if (leadBlock == m_trie_.m_dataOffset_) {
293                 // no entries for a whole block of lead surrogates
294                 if (currentValue != m_initialValue_) {
295                     m_nextValue_      = m_initialValue_;
296                     m_nextBlock_      = leadBlock;  // == m_trie_.m_dataOffset_
297                     m_nextBlockIndex_ = 0;
298                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
299                               currentValue);
300                     m_currentCodepoint_ = m_nextCodepoint_;
301                     return;
302                 }
303 
304                 nextLead += DATA_BLOCK_LENGTH_;
305                 // number of total affected supplementary codepoints in one
306                 // block
307                 // this is not a simple addition of
308                 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
309                 // that we might have moved some of the codepoints
310                 m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
311                 continue;
312             }
313             if (m_trie_.m_dataManipulate_ == null) {
314                 throw new NullPointerException(
315                             "The field DataManipulate in this Trie is null");
316             }
317             // enumerate trail surrogates for this lead surrogate
318             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
319                                m_trie_.getValue(leadBlock +
320                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
321             if (m_nextIndex_ <= 0) {
322                 // no data for this lead surrogate
323                 if (currentValue != m_initialValue_) {
324                     m_nextValue_      = m_initialValue_;
325                     m_nextBlock_      = m_trie_.m_dataOffset_;
326                     m_nextBlockIndex_ = 0;
327                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
328                               currentValue);
329                     m_currentCodepoint_ = m_nextCodepoint_;
330                     return;
331                 }
332                 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
333             } else {
334                 m_nextTrailIndexOffset_ = 0;
335                 if (!checkTrailBlock(currentValue)) {
336                     setResult(element, m_currentCodepoint_, m_nextCodepoint_,
337                               currentValue);
338                     m_currentCodepoint_ = m_nextCodepoint_;
339                     return;
340                 }
341             }
342             nextLead ++;
343          }
344 
345          // deliver last range
346          setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
347                    currentValue);
348     }
349 
350     /**
351     * Internal block value calculations
352     * Performs calculations on a data block to find codepoints in m_nextBlock_
353     * after the index m_nextBlockIndex_ that has the same value.
354     * Note m_*_ variables at this point is the next codepoint whose value
355     * has not been calculated.
356     * But when returned with false, it will be the last codepoint whose
357     * value has been calculated.
358     * @param currentValue the value which other codepoints are tested against
359     * @return true if the whole block has the same value as currentValue or if
360     *              the whole block has been calculated, false otherwise.
361     */
checkBlockDetail(int currentValue)362     private final boolean checkBlockDetail(int currentValue)
363     {
364         while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
365             m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
366                                                     m_nextBlockIndex_));
367             if (m_nextValue_ != currentValue) {
368                 return false;
369             }
370             ++ m_nextBlockIndex_;
371             ++ m_nextCodepoint_;
372         }
373         return true;
374     }
375 
376     /**
377     * Internal block value calculations
378     * Performs calculations on a data block to find codepoints in m_nextBlock_
379     * that has the same value.
380     * Will call checkBlockDetail() if highlevel check fails.
381     * Note m_*_ variables at this point is the next codepoint whose value
382     * has not been calculated.
383     * @param currentBlock the initial block containing all currentValue
384     * @param currentValue the value which other codepoints are tested against
385     * @return true if the whole block has the same value as currentValue or if
386     *              the whole block has been calculated, false otherwise.
387     */
checkBlock(int currentValue)388     private final boolean checkBlock(int currentValue)
389     {
390         int currentBlock = m_nextBlock_;
391         m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
392                                                   Trie.INDEX_STAGE_2_SHIFT_;
393         if (m_nextBlock_ == currentBlock &&
394             (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
395             // the block is the same as the previous one, filled with
396             // currentValue
397             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
398         }
399         else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
400             // this is the all-initial-value block
401             if (currentValue != m_initialValue_) {
402                 m_nextValue_      = m_initialValue_;
403                 m_nextBlockIndex_ = 0;
404                 return false;
405             }
406             m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
407         }
408         else {
409             if (!checkBlockDetail(currentValue)) {
410                 return false;
411             }
412         }
413         return true;
414     }
415 
416     /**
417     * Internal block value calculations
418     * Performs calculations on multiple data blocks for a set of trail
419     * surrogates to find codepoints in m_nextBlock_ that has the same value.
420     * Will call checkBlock() for internal block checks.
421     * Note m_*_ variables at this point is the next codepoint whose value
422     * has not been calculated.
423     * @param currentValue the value which other codepoints are tested against
424     * @return true if the whole block has the same value as currentValue or if
425     *              the whole block has been calculated, false otherwise.
426     */
checkTrailBlock(int currentValue)427     private final boolean checkTrailBlock(int currentValue)
428     {
429         // enumerate code points for this lead surrogate
430         while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
431         {
432             // if we ever reach here, we are at the start of a new block
433             m_nextBlockIndex_ = 0;
434             // copy of most of the body of the BMP loop
435             if (!checkBlock(currentValue)) {
436                 return false;
437             }
438             m_nextTrailIndexOffset_ ++;
439             m_nextIndex_ ++;
440         }
441         return true;
442     }
443 
444     /**
445     * Checks if we are beginning at the start of a initial block.
446     * If we are then the rest of the codepoints in this initial block
447     * has the same values.
448     * We increment m_nextCodepoint_ and relevant data members if so.
449     * This is used only in for the supplementary codepoints because
450     * the offset to the trail indexes could be 0.
451     * @return true if we are at the start of a initial block.
452     */
checkNullNextTrailIndex()453     private final boolean checkNullNextTrailIndex()
454     {
455         if (m_nextIndex_ <= 0) {
456             m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
457             int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
458             int leadBlock =
459                    m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
460                                                    Trie.INDEX_STAGE_2_SHIFT_;
461             if (m_trie_.m_dataManipulate_ == null) {
462                 throw new NullPointerException(
463                             "The field DataManipulate in this Trie is null");
464             }
465             m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
466                                m_trie_.getValue(leadBlock +
467                                    (nextLead & Trie.INDEX_STAGE_3_MASK_)));
468             m_nextIndex_ --;
469             m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
470             return true;
471         }
472         return false;
473     }
474 
475     // private data members --------------------------------------------
476 
477     /**
478     * Size of the stage 1 BMP indexes
479     */
480     private static final int BMP_INDEX_LENGTH_ =
481                                         0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
482     /**
483     * Lead surrogate minimum value
484     */
485     private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
486     /**
487     * Trail surrogate minimum value
488     */
489     private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
490     /*
491     * Trail surrogate maximum value
492     */
493     //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
494     /**
495     * Number of trail surrogate
496     */
497     private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
498     /**
499     * Number of stage 1 indexes for supplementary calculations that maps to
500     * each lead surrogate character.
501     * See second pass into getRawOffset for the trail surrogate character.
502     * 10 for significant number of bits for trail surrogates, 5 for what we
503     * discard during shifting.
504     */
505     private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
506                                     1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
507     /**
508     * Number of data values in a stage 2 (data array) block.
509     */
510     private static final int DATA_BLOCK_LENGTH_ =
511                                               1 << Trie.INDEX_STAGE_1_SHIFT_;
512 //    /**
513 //    * Number of codepoints in a stage 2 block
514 //    */
515 //    private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
516 //                                                     DATA_BLOCK_LENGTH_ << 10;
517     /**
518     * Trie instance
519     */
520     private Trie m_trie_;
521     /**
522     * Initial value for trie values
523     */
524     private int m_initialValue_;
525     /**
526     * Next element results and data.
527     */
528     private int m_currentCodepoint_;
529     private int m_nextCodepoint_;
530     private int m_nextValue_;
531     private int m_nextIndex_;
532     private int m_nextBlock_;
533     private int m_nextBlockIndex_;
534     private int m_nextTrailIndexOffset_;
535 }
536