• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 2005-2016 International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package android.icu.text;
12 
13 import static android.icu.impl.CharacterIteration.DONE32;
14 import static android.icu.impl.CharacterIteration.next32;
15 import static android.icu.impl.CharacterIteration.nextTrail32;
16 
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.OutputStream;
21 import java.nio.ByteBuffer;
22 import java.text.CharacterIterator;
23 import java.util.MissingResourceException;
24 import java.util.concurrent.ConcurrentLinkedQueue;
25 
26 import android.icu.impl.CharacterIteration;
27 import android.icu.impl.ICUBinary;
28 import android.icu.impl.ICUDebug;
29 import android.icu.impl.RBBIDataWrapper;
30 import android.icu.impl.breakiter.BurmeseBreakEngine;
31 import android.icu.impl.breakiter.CjkBreakEngine;
32 import android.icu.impl.breakiter.DictionaryBreakEngine;
33 import android.icu.impl.breakiter.KhmerBreakEngine;
34 import android.icu.impl.breakiter.LSTMBreakEngine;
35 import android.icu.impl.breakiter.LanguageBreakEngine;
36 import android.icu.impl.breakiter.LaoBreakEngine;
37 import android.icu.impl.breakiter.ThaiBreakEngine;
38 import android.icu.impl.breakiter.UnhandledBreakEngine;
39 import android.icu.lang.UCharacter;
40 import android.icu.lang.UProperty;
41 import android.icu.lang.UScript;
42 import android.icu.util.CodePointTrie;
43 
44 /**
45  * Rule Based Break Iterator
46  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
47  *
48  * @hide Only a subset of ICU is exposed in Android
49  */
50 public class RuleBasedBreakIterator extends BreakIterator {
51     //=======================================================================
52     // Constructors & Factories
53     //=======================================================================
54 
55     /**
56      * private constructor
57      */
RuleBasedBreakIterator()58     private RuleBasedBreakIterator() {
59         fDictionaryCharCount  = 0;
60     }
61 
62     /**
63      * Create a break iterator from a precompiled set of break rules.
64      *
65      * Creating a break iterator from the binary rules is much faster than
66      * creating one from source rules.
67      *
68      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
69      * Binary break iterator rules are not guaranteed to be compatible between
70      * different versions of ICU.
71      *
72      * @param is an input stream supplying the compiled binary rules.
73      * @throws IOException if there is an error while reading the rules from the InputStream.
74      * @see    #compileRules(String, OutputStream)
75      */
getInstanceFromCompiledRules(InputStream is)76     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
77         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
78         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
79         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
80         return This;
81     }
82 
83     /**
84      * This factory method doesn't have an access modifier; it is only accessible in the same
85      * package.
86      *
87      * Create a break iterator from a precompiled set of break rules.
88      *
89      * Creating a break iterator from the binary rules is much faster than
90      * creating one from source rules.
91      *
92      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
93      * Binary break iterator rules are not guaranteed to be compatible between
94      * different versions of ICU.
95      *
96      * @param bytes a buffer supplying the compiled binary rules.
97      * @param phraseBreaking a flag indicating if phrase breaking is required.
98      * @throws IOException if there is an error while reading the rules from the buffer.
99      * @see    #compileRules(String, OutputStream)
100      * @hide draft / provisional / internal are hidden on Android
101      */
getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)102     /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
103             ByteBuffer bytes, boolean phraseBreaking) throws IOException {
104         RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
105         instance.fPhraseBreaking = phraseBreaking;
106         return instance;
107     }
108 
109     /**
110      * Create a break iterator from a precompiled set of break rules.
111      *
112      * Creating a break iterator from the binary rules is much faster than
113      * creating one from source rules.
114      *
115      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
116      * Binary break iterator rules are not guaranteed to be compatible between
117      * different versions of ICU.
118      *
119      * @param bytes a buffer supplying the compiled binary rules.
120      * @throws IOException if there is an error while reading the rules from the buffer.
121      * @see    #compileRules(String, OutputStream)
122      * @deprecated This API is ICU internal only.
123      * @hide draft / provisional / internal are hidden on Android
124      */
125     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)126     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
127         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
128         This.fRData = RBBIDataWrapper.get(bytes);
129         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
130         return This;
131     }
132 
133     /**
134      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
135      * @param rules The break rules to be used.
136      */
RuleBasedBreakIterator(String rules)137     public RuleBasedBreakIterator(String rules)  {
138         this();
139         try {
140             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
141             compileRules(rules, ruleOS);
142             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
143             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
144         } catch (IOException e) {
145             ///CLOVER:OFF
146             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
147             //  causing bogus compiled rules to be produced, but with no compile error raised.
148             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
149                     + e.getMessage());
150             throw rte;
151             ///CLOVER:ON
152         }
153     }
154 
155     //=======================================================================
156     // Boilerplate
157     //=======================================================================
158 
159     /**
160      * Clones this iterator.
161      * @return A newly-constructed RuleBasedBreakIterator with the same
162      * behavior as this one.
163      */
164     @Override
clone()165     public Object clone()  {
166         RuleBasedBreakIterator result;
167         result = (RuleBasedBreakIterator)super.clone();
168         if (fText != null) {
169             result.fText = (CharacterIterator)(fText.clone());
170         }
171         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
172         result.fBreakCache = result.new BreakCache(fBreakCache);
173         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
174         return result;
175     }
176 
177 
178     /**
179      * Returns true if both BreakIterators are of the same class, have the same
180      * rules, and iterate over the same text.
181      */
182     @Override
equals(Object that)183     public boolean equals(Object that) {
184         if (that == null) {
185             return false;
186         }
187         if (this == that) {
188             return true;
189         }
190         try {
191             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
192             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
193                 return false;
194             }
195             if (fRData != null && other.fRData != null &&
196                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
197                 return false;
198             }
199             if (fText == null && other.fText == null) {
200                 return true;
201             }
202             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
203                 return false;
204             }
205             return fPosition == other.fPosition;
206         }
207         catch(ClassCastException e) {
208             return false;
209         }
210      }
211 
212     /**
213      * Returns the description (rules) used to create this iterator.
214      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
215      */
216     @Override
toString()217     public String toString() {
218         String retStr = "";
219         if (fRData != null) {
220             retStr =  fRData.fRuleSource;
221         }
222         return retStr;
223     }
224 
225     /**
226      * Compute a hashcode for this BreakIterator
227      * @return A hash code
228      */
229     @Override
hashCode()230     public int hashCode()
231     {
232         return fRData.fRuleSource.hashCode();
233     }
234 
235 
236     private static final int  START_STATE = 1;     // The state number of the starting state
237     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
238 
239     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
240     //               of user text.  A variable with this enum type keeps track of where we
241     //               are.  The state machine only fetches user text input while in RUN mode.
242     private static final int  RBBI_START  = 0;
243     private static final int  RBBI_RUN    = 1;
244     private static final int  RBBI_END    = 2;
245 
246     /**
247      * The character iterator through which this BreakIterator accesses the text.
248      */
249     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
250 
251     /**
252      * The rule data for this BreakIterator instance.
253      * Not intended for public use. Declared public for testing purposes only.
254      * @deprecated This API is ICU internal only.
255      * @hide draft / provisional / internal are hidden on Android
256      */
257     @Deprecated
258     public RBBIDataWrapper    fRData;
259 
260     /**
261      *  The iteration state - current position, rule status for the current position,
262      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
263      *                        Current position is pinned to be 0 < position <= text.length.
264      *                        Current position is always set to a boundary.
265      *
266      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
267      *  Never has the value UBRK_DONE (-1).
268      */
269     private int                fPosition;
270 
271     /**
272      * Index of the Rule {tag} values for the most recent match.
273      */
274     private int                fRuleStatusIndex;
275 
276     /**
277      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
278      */
279     private boolean            fDone;
280 
281     /**
282      *  Array of look-ahead tentative results.
283      */
284     private int[]              fLookAheadMatches;
285 
286     /**
287      *   Cache of previously determined boundary positions.
288      */
289     private BreakCache         fBreakCache = new BreakCache();
290 
291     /**
292      * Flag used to indicate if phrase breaking is required.
293      */
294     private boolean            fPhraseBreaking = false;
295 
296 
297     /**
298      * Counter for the number of characters encountered with the "dictionary"
299      *   flag set.  Normal RBBI iterators don't use it, although the code
300      *   for updating it is live.  Dictionary Based break iterators (a subclass
301      *   of us) access this field directly.
302      * @hide draft / provisional / internal are hidden on Android
303      */
304     private int fDictionaryCharCount;
305 
306     private DictionaryCache     fDictionaryCache = new DictionaryCache();
307 
308     /**
309      * ICU debug argument name for RBBI
310      */
311     private static final String RBBI_DEBUG_ARG = "rbbi";
312 
313     /**
314      * Debugging flag.  Trace operation of state machine when true.
315      */
316     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
317             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
318 
319     /**
320      * The "default" break engine - just skips over ranges of dictionary words,
321      * producing no breaks. Should only be used if characters need to be handled
322      * by a dictionary but we have no dictionary implementation for them.
323      *
324      * Only one instance; shared by all break iterators.
325      */
326     private static final UnhandledBreakEngine gUnhandledBreakEngine;
327 
328     /**
329      * List of all known break engines, common for all break iterators.
330      * Lazily updated as break engines are needed, because instantiation of
331      * break engines is expensive.
332      *
333      * Important notes:
334      * <ul>Because we don't want to add the same LanguageBreakEngine multiple times, all writes
335      *     are synchronized.
336      * <ul>Read access avoids explicit synchronization, but will end up being synchronized if
337      *     needed.
338      */
339     private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines;
340 
341     static {
342         gUnhandledBreakEngine = new UnhandledBreakEngine();
343         gAllBreakEngines = new ConcurrentLinkedQueue<>();
344         gAllBreakEngines.add(gUnhandledBreakEngine);
345     }
346 
347     /**
348      * Dump the contents of the state table and character classes for this break iterator.
349      * For debugging only.
350      * @deprecated This API is ICU internal only.
351      * @hide draft / provisional / internal are hidden on Android
352      */
353     @Deprecated
dump(java.io.PrintStream out)354     public void dump(java.io.PrintStream out) {
355         if (out == null) {
356             out = System.out;
357         }
358         this.fRData.dump(out);
359     }
360 
361     /**
362      * Compile a set of source break rules into the binary state tables used
363      * by the break iterator engine.  Creating a break iterator from precompiled
364      * rules is much faster than creating one from source rules.
365      *
366      * Binary break rules are not guaranteed to be compatible between different
367      * versions of ICU.
368      *
369      *
370      * @param rules  The source form of the break rules
371      * @param ruleBinary  An output stream to receive the compiled rules.
372      * @throws IOException If there is an error writing the output.
373      * @see #getInstanceFromCompiledRules(InputStream)
374      */
compileRules(String rules, OutputStream ruleBinary)375     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
376         RBBIRuleBuilder.compileRules(rules, ruleBinary);
377     }
378 
379     //=======================================================================
380     // BreakIterator overrides
381     //=======================================================================
382 
383     /**
384      * Sets the current iteration position to the beginning of the text.
385      * (i.e., the CharacterIterator's starting offset).
386      * @return The offset of the beginning of the text.
387      */
388     @Override
first()389     public int first() {
390         if (fText == null) {
391             return BreakIterator.DONE;
392         }
393         fText.first();
394         int start =  fText.getIndex();
395         if (!fBreakCache.seek(start)) {
396             fBreakCache.populateNear(start);
397         }
398         fBreakCache.current();
399         assert(fPosition == start);
400         return fPosition;
401     }
402 
403     /**
404      * Sets the current iteration position to the end of the text.
405      * (i.e., the CharacterIterator's ending offset).
406      * @return The text's past-the-end offset.
407      */
408     @Override
last()409     public int last() {
410         if (fText == null) {
411             return BreakIterator.DONE;
412         }
413         int endPos = fText.getEndIndex();
414         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
415         assert(endShouldBeBoundary);
416         if (fPosition != endPos) {
417             assert(fPosition == endPos);
418         }
419         return endPos;
420     }
421 
422     /**
423      * Advances the iterator either forward or backward the specified number of steps.
424      * Negative values move backward, and positive values move forward.  This is
425      * equivalent to repeatedly calling next() or previous().
426      * @param n The number of steps to move.  The sign indicates the direction
427      * (negative is backwards, and positive is forwards).
428      * @return The character offset of the boundary position n boundaries away from
429      * the current one.
430      */
431     @Override
next(int n)432     public int next(int n) {
433         int result = 0;
434         if (n > 0) {
435             for (; n > 0 && result != DONE; --n) {
436                 result = next();
437             }
438         } else if (n < 0) {
439             for (; n < 0 && result != DONE; ++n) {
440                 result = previous();
441             }
442         } else {
443             result = current();
444         }
445         return result;
446     }
447 
448     /**
449      * Advances the iterator to the next boundary position.
450      * @return The position of the first boundary after this one.
451      */
452     @Override
next()453     public int next() {
454         fBreakCache.next();
455         return fDone ? DONE : fPosition;
456     }
457 
458     /**
459      * Moves the iterator backwards, to the boundary preceding the current one.
460      * @return The position of the boundary position immediately preceding the starting position.
461      */
462     @Override
previous()463     public int previous() {
464         fBreakCache.previous();
465         return fDone ? DONE : fPosition;
466     }
467 
468     /**
469      * Sets the iterator to refer to the first boundary position following
470      * the specified position.
471      * @param startPos The position from which to begin searching for a break position.
472      * @return The position of the first break after the current position.
473      */
474     @Override
following(int startPos)475     public int following(int startPos) {
476         // if the supplied position is before the beginning, return the
477         // text's starting offset
478         if (startPos < fText.getBeginIndex()) {
479             return first();
480         }
481 
482         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
483         // Or it may be beyond the end of the text.
484         startPos = CISetIndex32(fText, startPos);
485         fBreakCache.following(startPos);
486         return fDone ? DONE : fPosition;
487     }
488 
489 
490     /**
491      * Sets the iterator to refer to the last boundary position before the
492      * specified position.
493      * @param offset The position to begin searching for a break from.
494      * @return The position of the last boundary before the starting position.
495      */
496     @Override
preceding(int offset)497     public int preceding(int offset) {
498         if (fText == null || offset > fText.getEndIndex()) {
499             return last();
500         } else if (offset < fText.getBeginIndex()) {
501             return first();
502         }
503 
504         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
505         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
506         int adjustedOffset = offset;
507         fBreakCache.preceding(adjustedOffset);
508         return fDone ? DONE : fPosition;
509 
510     }
511 
512 
513     /**
514      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
515      */
checkOffset(int offset, CharacterIterator text)516     protected static final void checkOffset(int offset, CharacterIterator text) {
517         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
518             throw new IllegalArgumentException("offset out of bounds");
519         }
520     }
521 
522 
523     /**
524      * Returns true if the specified position is a boundary position.  As a side
525      * effect, leaves the iterator pointing to the first boundary position at
526      * or after "offset".
527      * @param offset the offset to check.
528      * @return True if "offset" is a boundary position.
529      */
530     @Override
isBoundary(int offset)531     public boolean isBoundary(int offset) {
532         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
533         //       to not be boundaries, and to not be errors.
534         checkOffset(offset, fText);
535 
536         // Adjust offset to be on a code point boundary and not beyond the end of the text.
537         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
538         // But we still need the side effect of leaving iteration at the following boundary.
539         int adjustedOffset = CISetIndex32(fText, offset);
540 
541         boolean result = false;
542         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
543             result = (fBreakCache.current() == offset);
544         }
545 
546         if (!result) {
547             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
548             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
549             next();
550         }
551         return result;
552 
553     }
554 
555     /**
556      * Returns the current iteration position.  Note that DONE is never
557      * returned from this function; if iteration has run to the end of a
558      * string, current() will return the length of the string while
559      * next() will return BreakIterator.DONE).
560      * @return The current iteration position.
561      */
562     @Override
current()563     public int current() {
564         return (fText != null) ? fPosition : BreakIterator.DONE;
565     }
566 
567 
568     /**
569      * Return the status tag from the break rule that determined the boundary at
570      * the current iteration position.  The values appear in the rule source
571      * within brackets, {123}, for example.  For rules that do not specify a
572      * status, a default value of 0 is returned.  If more than one rule applies,
573      * the numerically largest of the possible status values is returned.
574      * <p>
575      * Of the standard types of ICU break iterators, only the word and line break
576      * iterator provides status values.  The values are defined in
577      * class RuleBasedBreakIterator, and allow distinguishing between words
578      * that contain alphabetic letters, "words" that appear to be numbers,
579      * punctuation and spaces, words containing ideographic characters, and
580      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
581      * position from <code>next()</code>, <code>previous()</code>, or
582      * any other break iterator functions that returns a boundary position.
583      * <p>
584      * Note that <code>getRuleStatus()</code> returns the value corresponding to
585      * <code>current()</code> index even after <code>next()</code> has returned DONE.
586      * <p>
587 
588      * @return the status from the break rule that determined the boundary
589      * at the current iteration position.
590      */
591 
592     @Override
getRuleStatus()593     public int  getRuleStatus() {
594         //   Status records have this form:
595         //           Count N         <--  fLastRuleStatusIndex points here.
596         //           Status val 0
597         //           Status val 1
598         //              ...
599         //           Status val N-1  <--  the value we need to return
600         //   The status values are sorted in ascending order.
601         //   This function returns the last (largest) of the array of status values.
602         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
603         int  tagVal = fRData.fStatusTable[idx];
604         return tagVal;
605     }
606 
607     /**
608      * Get the status (tag) values from the break rule(s) that determined the boundary
609      * at the current iteration position.  The values appear in the rule source
610      * within brackets, {123}, for example.  The default status value for rules
611      * that do not explicitly provide one is zero.
612      * <p>
613      * The status values used by the standard ICU break rules are defined
614      * as public constants in class RuleBasedBreakIterator.
615      * <p>
616      * If the size  of the output array is insufficient to hold the data,
617      *  the output will be truncated to the available length.  No exception
618      *  will be thrown.
619      *
620      * @param fillInArray an array to be filled in with the status values.
621      * @return          The number of rule status values from the rules that determined
622      *                  the boundary at the current iteration position.
623      *                  In the event that the array is too small, the return value
624      *                  is the total number of status values that were available,
625      *                  not the reduced number that were actually returned.
626      */
627     @Override
getRuleStatusVec(int[] fillInArray)628     public int getRuleStatusVec(int[] fillInArray) {
629         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
630         if (fillInArray != null) {
631             int numToCopy = Math.min(numStatusVals, fillInArray.length);
632             for (int i=0; i<numToCopy; i++) {
633                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
634             }
635         }
636         return numStatusVals;
637     }
638 
639     /**
640      * Returns a CharacterIterator over the text being analyzed.
641      * <p>
642      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
643      * must not be modified in any way while the BreakIterator is still in use.
644      * Doing so will lead to undefined behavior of the BreakIterator.
645      * Clone the returned CharacterIterator first and work with that.
646      * <p>
647      * The returned CharacterIterator is a reference
648      * to the <b>actual iterator being used</b> by the BreakIterator.
649      * No guarantees are made about the current position
650      * of this iterator when it is returned; it may differ from the
651      * BreakIterators current position.  If you need to move that
652      * position to examine the text, clone this function's return value first.
653      * @return An iterator over the text being analyzed.
654      */
655     @Override
getText()656     public CharacterIterator getText() {
657         return fText;
658     }
659 
660     /**
661      * Set the iterator to analyze a new piece of text.  This function resets
662      * the current iteration position to the beginning of the text.
663      * (The old iterator is dropped.)
664      * <p>
665      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
666      * directly by the BreakIterator, and must not be altered in any
667      * way by code outside of the BreakIterator.
668      * Doing so will lead to undefined behavior of the BreakIterator.
669      *
670      * @param newText An iterator over the text to analyze.
671      */
672     @Override
setText(CharacterIterator newText)673     public void setText(CharacterIterator newText) {
674         if (newText != null) {
675             fBreakCache.reset(newText.getBeginIndex(), 0);
676         } else {
677             fBreakCache.reset();
678         }
679         fDictionaryCache.reset();
680         fText = newText;
681         this.first();
682     }
683 
684      /**
685      * Control debug, trace and dump options.
686      * @deprecated This API is ICU internal only.
687      * @hide draft / provisional / internal are hidden on Android
688      */
689     @Deprecated
690     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
691                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
692 
693 
getLanguageBreakEngine(int c)694     private LanguageBreakEngine getLanguageBreakEngine(int c) {
695 
696         // We have a dictionary character.
697         // Does an already instantiated break engine handle it?
698         // First read without synchronization, which could lead to a new language
699         // break engine being added and we didn't go over it.
700         for (LanguageBreakEngine candidate : gAllBreakEngines) {
701             if (candidate.handles(c)) {
702                 return candidate;
703             }
704         }
705 
706         synchronized (gAllBreakEngines) {
707             // Another break iterator may have instantiated the desired engine.
708             for (LanguageBreakEngine candidate : gAllBreakEngines) {
709                 if (candidate.handles(c)) {
710                     return candidate;
711                 }
712             }
713 
714             // The global list doesn't have an existing engine, build one.
715             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
716             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
717                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
718                 // Fold them together for mapping from script -> engine.
719                 script = UScript.HAN;
720             }
721 
722             LanguageBreakEngine eng;
723             try {
724                 switch (script) {
725                 case UScript.THAI:
726                     try {
727                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
728                     } catch (MissingResourceException e) {
729                         eng = new ThaiBreakEngine();
730                     }
731                     break;
732                 case UScript.LAO:
733                     eng = new LaoBreakEngine();
734                     break;
735                 case UScript.MYANMAR:
736                     try {
737                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
738                     } catch (MissingResourceException e) {
739                         eng = new BurmeseBreakEngine();
740                     }
741                     break;
742                 case UScript.KHMER:
743                     eng = new KhmerBreakEngine();
744                     break;
745                 case UScript.HAN:
746                     eng = new CjkBreakEngine(false);
747                      break;
748                 case UScript.HANGUL:
749                     eng = new CjkBreakEngine(true);
750                     break;
751                 default:
752                     gUnhandledBreakEngine.handleChar(c);
753                     eng = gUnhandledBreakEngine;
754                     break;
755                 }
756             } catch (IOException e) {
757                 eng = null;
758             }
759 
760             if (eng != null && eng != gUnhandledBreakEngine) {
761                 gAllBreakEngines.add(eng);
762             }
763             return eng;
764         }   // end synchronized(gAllBreakEngines)
765     }
766 
767     /**
768      * The State Machine Engine for moving forward is here.
769      * This function is the heart of the RBBI run time engine.
770      *
771      * Input
772      *    fPosition, the position in the text to begin from.
773      * Output
774      *    fPosition:           the boundary following the starting position.
775      *    fDictionaryCharCount the number of dictionary characters encountered.
776      *                         If > 0, the segment will be further subdivided
777      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
778      *
779      * @return the new iterator position
780      *
781      * A note on supplementary characters and the position of underlying
782      * Java CharacterIterator:   Normally, a character iterator is positioned at
783      * the char most recently returned by next().  Within this function, when
784      * a supplementary char is being processed, the char iterator is left
785      * sitting on the trail surrogate, in the middle of the code point.
786      * This is different from everywhere else, where an iterator always
787      * points at the lead surrogate of a supplementary.
788      */
handleNext()789     private int handleNext() {
790         if (TRACE) {
791             System.out.println("Handle Next   pos      char  state category");
792         }
793 
794         // handleNext always sets the break tag value.
795         // Set the default for it.
796         fRuleStatusIndex  = 0;
797         fDictionaryCharCount = 0;
798 
799         // caches for quicker access
800         CharacterIterator text = fText;
801         CodePointTrie trie = fRData.fTrie;
802 
803         char[] stateTable  = fRData.fFTable.fTable;
804         int initialPosition = fPosition;
805         text.setIndex(initialPosition);
806         int result          = initialPosition;
807 
808         // Set up the starting char
809         int c = text.current();
810         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
811             c = nextTrail32(text, c);
812             if (c == DONE32) {
813                 fDone = true;
814                 return BreakIterator.DONE;
815             }
816         }
817 
818         // Set the initial state for the state machine
819         int state           = START_STATE;
820         int row             = fRData.getRowIndex(state);
821         short category      = 3;
822         int flagsState      = fRData.fFTable.fFlags;
823         int dictStart       = fRData.fFTable.fDictCategoriesStart;
824         int mode            = RBBI_RUN;
825         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
826             category = 2;
827             mode     = RBBI_START;
828             if (TRACE) {
829                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
830                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
831                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
832             }
833         }
834 
835         // loop until we reach the end of the text or transition to state 0
836         while (state != STOP_STATE) {
837             if (c == DONE32) {
838                 // Reached end of input string.
839                 if (mode == RBBI_END) {
840                     // We have already run the loop one last time with the
841                     // character set to the pseudo {eof} value. Now it is time
842                     // to unconditionally bail out.
843                     break;
844                 }
845                 // Run the loop one last time with the fake end-of-input character category
846                 mode = RBBI_END;
847                 category = 1;
848             }
849             else if (mode == RBBI_RUN) {
850                 // Get the char category.  An incoming category of 1 or 2 mens that
851                 //      we are preset for doing the beginning or end of input, and
852                 //      that we shouldn't get a category from an actual text input character.
853                 //
854 
855                 // look up the current character's character category, which tells us
856                 // which column in the state table to look at.
857                 //
858                 category = (short) trie.get(c);
859 
860                 // Check for categories that require word dictionary handling.
861                 if (category >= dictStart) {
862                     fDictionaryCharCount++;
863                 }
864 
865                 if (TRACE) {
866                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
867                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
868                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
869                 }
870 
871                 // Advance to the next character.
872                 // If this is a beginning-of-input loop iteration, don't advance.
873                 //    The next iteration will be processing the first real input character.
874                 c = text.next();
875                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
876                     c = nextTrail32(text, c);
877                 }
878             }
879             else {
880                 mode = RBBI_RUN;
881             }
882 
883             // look up a state transition in the state table
884             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
885             row   = fRData.getRowIndex(state);
886             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
887             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
888                 // Match found, common case
889                 result = text.getIndex();
890                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
891                     // The iterator has been left in the middle of a surrogate pair.
892                     // We want the start of it.
893                     result--;
894                 }
895 
896                 //  Remember the break status (tag) values.
897                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
898             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
899                 // Lookahead match is completed
900                 int lookaheadResult = fLookAheadMatches[accepting];
901                 if (lookaheadResult >= 0) {
902                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
903                     fPosition = lookaheadResult;
904                     return lookaheadResult;
905                 }
906             }
907 
908 
909             // If we are at the position of the '/' in a look-ahead (hard break) rule;
910             // record the current position, to be returned later, if the full rule matches.
911             // TODO: Move this check before the previous check of fAccepting.
912             //       This would enable hard-break rules with no following context.
913             //       But there are line break test failures when trying this. Investigate.
914             //       Issue ICU-20837
915             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
916             if (rule != 0) {
917                 int  pos = text.getIndex();
918                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
919                     // The iterator has been left in the middle of a surrogate pair.
920                     // We want the beginning  of it.
921                     pos--;
922                 }
923                 fLookAheadMatches[rule] = pos;
924             }
925 
926 
927         }        // End of state machine main loop
928 
929         // The state machine is done.  Check whether it found a match...
930 
931         // If the iterator failed to advance in the match engine force it ahead by one.
932         // This indicates a defect in the break rules, which should always match
933         // at least one character.
934 
935         if (result == initialPosition) {
936             if (TRACE) {
937                 System.out.println("Iterator did not move. Advancing by 1.");
938             }
939             text.setIndex(initialPosition);
940             next32(text);
941             result = text.getIndex();
942             fRuleStatusIndex = 0;
943         }
944 
945         // Leave the iterator at our result position.
946         //   (we may have advanced beyond the last accepting position chasing after
947         //    longer matches that never completed.)
948         fPosition = result;
949 
950         if (TRACE) {
951             System.out.println("result = " + result);
952         }
953         return result;
954     }
955 
956     /**
957      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
958      * This locates a "Safe Position" from which the forward break rules
959      * will operate correctly. A Safe Position is not necessarily a boundary itself.
960      *
961      * The logic of this function is very similar to handleNext(), above, but simpler
962      * because the safe table does not require as many options.
963      *
964      * @param fromPosition the position in the input text to begin the iteration.
965      * @hide draft / provisional / internal are hidden on Android
966      */
handleSafePrevious(int fromPosition)967     private int handleSafePrevious(int fromPosition) {
968         char            state;
969         short           category = 0;
970         int             result = 0;
971 
972         // caches for quicker access
973         CharacterIterator text = fText;
974         CodePointTrie trie = fRData.fTrie;
975         char[] stateTable  = fRData.fRTable.fTable;
976 
977         CISetIndex32(text, fromPosition);
978         if (TRACE) {
979             System.out.print("Handle Previous   pos   char  state category");
980         }
981 
982         // if we're already at the start of the text, return DONE.
983         if (text.getIndex() == text.getBeginIndex()) {
984             return BreakIterator.DONE;
985         }
986 
987         //  Set the initial state for the state machine
988         int c = CharacterIteration.previous32(text);
989         state = START_STATE;
990         int row = fRData.getRowIndex(state);
991 
992         // loop until we reach the start of the text or transition to state 0
993         //
994         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
995 
996             // look up the current character's character category, which tells us
997             // which column in the state table to look at.
998             //
999             //  And off the dictionary flag bit. For reverse iteration it is not used.
1000             category = (short) trie.get(c);
1001             if (TRACE) {
1002                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1003                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1004                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1005             }
1006 
1007             // State Transition - move machine to its next state
1008             //
1009             assert(category < fRData.fHeader.fCatCount);
1010             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1011             row   = fRData.getRowIndex(state);
1012 
1013             if (state == STOP_STATE) {
1014                 // This is the normal exit from the lookup state machine.
1015                 // Transition to state zero means we have found a safe point.
1016                 break;
1017             }
1018         }
1019 
1020         // The state machine is done.
1021         result = text.getIndex();
1022         if (TRACE) {
1023             System.out.println("result = " + result);
1024         }
1025         return result;
1026     }
1027 
1028     /**
1029      * Set the index of a CharacterIterator.
1030      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1031      * If the index points to a trail surrogate of a supplementary character, adjust it
1032      * to the start (lead surrogate) index.
1033      *
1034      * @param ci A CharacterIterator to set
1035      * @param index the index to set
1036      * @return the resulting index, possibly pinned or adjusted.
1037      */
1038     private static int CISetIndex32(CharacterIterator ci, int index) {
1039         if (index <= ci.getBeginIndex()) {
1040             ci.first();
1041         } else if (index >= ci.getEndIndex()) {
1042             ci.setIndex(ci.getEndIndex());
1043         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1044             if (!Character.isHighSurrogate(ci.previous())) {
1045                 ci.next();
1046             }
1047         }
1048         return ci.getIndex();
1049     }
1050 
1051     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1052      *                 Dictionary boundaries are moved first to this cache, then from here
1053      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1054      *                 boundaries. The public BreakIterator API always fetches directly
1055      *                 from the main BreakCache, not from here.
1056      *
1057      *                 In common situations, the number of boundaries in a single dictionary run
1058      *                 should be quite small, it will be terminated by punctuation, spaces,
1059      *                 or any other non-dictionary characters. The main BreakCache may end
1060      *                 up with boundaries from multiple dictionary based runs.
1061      *
1062      *                 The boundaries are stored in a simple ArrayList (vector), with the
1063      *                 assumption that they will be accessed sequentially.
1064      */
1065     class DictionaryCache  {
1066 
1067          void reset() {
1068              fPositionInCache = -1;
1069              fStart = 0;
1070              fLimit = 0;
1071              fFirstRuleStatusIndex = 0;
1072              fOtherRuleStatusIndex = 0;
1073              fBreaks.removeAllElements();
1074          };
1075 
1076          boolean following(int fromPos) {
1077              if (fromPos >= fLimit || fromPos < fStart) {
1078                  fPositionInCache = -1;
1079                  return false;
1080              }
1081 
1082              // Sequential iteration, move from previous boundary to the following
1083 
1084              int r = 0;
1085              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1086                  ++fPositionInCache;
1087                  if (fPositionInCache >= fBreaks.size()) {
1088                      fPositionInCache = -1;
1089                      return false;
1090                  }
1091                  r = fBreaks.elementAt(fPositionInCache);
1092                  assert(r > fromPos);
1093                  fBoundary = r;
1094                  fStatusIndex = fOtherRuleStatusIndex;
1095                  return true;
1096              }
1097 
1098              // Random indexing. Linear search for the boundary following the given position.
1099 
1100              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1101                  r= fBreaks.elementAt(fPositionInCache);
1102                  if (r > fromPos) {
1103                      fBoundary = r;
1104                      fStatusIndex = fOtherRuleStatusIndex;
1105                      return true;
1106                  }
1107              }
1108 
1109              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1110              assert(false);
1111              fPositionInCache = -1;
1112              return false;
1113          };
1114 
preceding(int fromPos)1115          boolean preceding(int fromPos) {
1116              if (fromPos <= fStart || fromPos > fLimit) {
1117                  fPositionInCache = -1;
1118                  return false;
1119              }
1120 
1121              if (fromPos == fLimit) {
1122                  fPositionInCache = fBreaks.size() - 1;
1123                  if (fPositionInCache >= 0) {
1124                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1125                  }
1126              }
1127 
1128              int r;
1129              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1130                  --fPositionInCache;
1131                  r = fBreaks.elementAt(fPositionInCache);
1132                  assert(r < fromPos);
1133                  fBoundary = r;
1134                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1135                  return true;
1136              }
1137 
1138              if (fPositionInCache == 0) {
1139                  fPositionInCache = -1;
1140                  return false;
1141              }
1142 
1143              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1144                  r = fBreaks.elementAt(fPositionInCache);
1145                  if (r < fromPos) {
1146                      fBoundary = r;
1147                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1148                      return true;
1149                  }
1150              }
1151              assert(false);
1152              fPositionInCache = -1;
1153              return false;
1154          };
1155 
1156         /**
1157          * Populate the cache with the dictionary based boundaries within a region of text.
1158          * @param startPos  The start position of a range of text
1159          * @param endPos    The end position of a range of text
1160          * @param firstRuleStatus The rule status index that applies to the break at startPos
1161          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1162          * @hide draft / provisional / internal are hidden on Android
1163          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1164         void populateDictionary(int startPos, int endPos,
1165                                 int firstRuleStatus, int otherRuleStatus) {
1166             if ((endPos - startPos) <= 1) {
1167                 return;
1168             }
1169 
1170             reset();
1171             fFirstRuleStatusIndex = firstRuleStatus;
1172             fOtherRuleStatusIndex = otherRuleStatus;
1173 
1174             int rangeStart = startPos;
1175             int rangeEnd = endPos;
1176 
1177             int         category;
1178             int         current;
1179             int         foundBreakCount = 0;
1180 
1181             // Loop through the text, looking for ranges of dictionary characters.
1182             // For each span, find the appropriate break engine, and ask it to find
1183             // any breaks within the span.
1184 
1185             fText.setIndex(rangeStart);
1186             int     c = CharacterIteration.current32(fText);
1187             category = (short)fRData.fTrie.get(c);
1188             int dictStart = fRData.fFTable.fDictCategoriesStart;
1189 
1190             while(true) {
1191                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1192                     c = CharacterIteration.next32(fText);    // pre-increment
1193                     category = (short)fRData.fTrie.get(c);
1194                 }
1195                 if (current >= rangeEnd) {
1196                     break;
1197                 }
1198 
1199                 // We now have a dictionary character. Get the appropriate language object
1200                 // to deal with it.
1201                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1202 
1203                 // Ask the language object if there are any breaks. It will add them to the cache and
1204                 // leave the text pointer on the other side of its range, ready to search for the next one.
1205                 if (lbe != null) {
1206                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
1207                 }
1208 
1209                 // Reload the loop variables for the next go-round
1210                 c = CharacterIteration.current32(fText);
1211                 category = (short)fRData.fTrie.get(c);
1212             }
1213 
1214             // If we found breaks, ensure that the first and last entries are
1215             // the original starting and ending position. And initialize the
1216             // cache iteration position to the first entry.
1217 
1218             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1219             if (foundBreakCount > 0) {
1220                 assert(foundBreakCount == fBreaks.size());
1221                 if (startPos < fBreaks.elementAt(0)) {
1222                     // The dictionary did not place a boundary at the start of the segment of text.
1223                     // Add one now. This should not commonly happen, but it would be easy for interactions
1224                     // of the rules for dictionary segments and the break engine implementations to
1225                     // inadvertently cause it. Cover it here, just in case.
1226                     fBreaks.offer(startPos);
1227                 }
1228                 if (endPos > fBreaks.peek()) {
1229                     fBreaks.push(endPos);
1230                 }
1231                 fPositionInCache = 0;
1232                 // Note: Dictionary matching may extend beyond the original limit.
1233                 fStart = fBreaks.elementAt(0);
1234                 fLimit = fBreaks.peek();
1235             } else {
1236                 // there were no language-based breaks, even though the segment contained
1237                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1238                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1239             }
1240 
1241         };
1242 
1243 
DictionaryCache()1244         DictionaryCache() {
1245             fPositionInCache = -1;
1246             fBreaks = new DictionaryBreakEngine.DequeI();
1247         }
1248 
1249         /**
1250          * copy constructor. Used by RuleBasedBreakIterator.clone().
1251          *
1252          * @param src the source object to be copied.
1253          */
DictionaryCache(DictionaryCache src)1254         DictionaryCache(DictionaryCache src)  {
1255             try {
1256                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1257             }
1258             catch (CloneNotSupportedException e) {
1259                 throw new RuntimeException(e);
1260             }
1261             fPositionInCache      = src.fPositionInCache;
1262             fStart                = src.fStart;
1263             fLimit                = src.fLimit;
1264             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1265             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1266             fBoundary             = src.fBoundary;
1267             fStatusIndex          = src.fStatusIndex;
1268         }
1269 
1270         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1271         DictionaryBreakEngine.DequeI fBreaks;
1272         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1273         //                                      //    or preceding(). Optimizes sequential access.
1274         int             fStart;                 // Text position of first boundary in cache.
1275         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1276         //                                      //    text segment being handled by the dictionary.
1277         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1278         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1279         int             fBoundary;              // Current boundary. Set by preceding(), following().
1280         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1281     };
1282 
1283 
1284 
1285 
1286 /*
1287  * class BreakCache
1288  *
1289  * Cache of break boundary positions and rule status values.
1290  * Break iterator API functions, next(), previous(), etc., will use cached results
1291  * when possible, and otherwise cache new results as they are obtained.
1292  *
1293  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1294  *
1295  * The cache is implemented as a single circular buffer.
1296  */
1297 
1298 /*
1299  * size of the circular cache buffer.
1300  */
1301 
1302 class BreakCache {
1303 
BreakCache()1304     BreakCache() {
1305         reset();
1306     };
1307 
reset(int pos, int ruleStatus)1308     void  reset(int pos, int ruleStatus) {
1309         fStartBufIdx = 0;
1310         fEndBufIdx = 0;
1311         fTextIdx = pos;
1312         fBufIdx = 0;
1313         fBoundaries[0] = pos;
1314         fStatuses[0] = (short)ruleStatus;
1315     }
1316 
reset()1317     void  reset() {reset(0, 0); };
1318 
next()1319     void  next() {
1320         if (fBufIdx == fEndBufIdx) {
1321             fDone = !populateFollowing();
1322             fPosition = fTextIdx;
1323             fRuleStatusIndex = fStatuses[fBufIdx];
1324         } else {
1325             fBufIdx = modChunkSize(fBufIdx + 1);
1326             fTextIdx = fPosition = fBoundaries[fBufIdx];
1327             fRuleStatusIndex = fStatuses[fBufIdx];
1328         }
1329     };
1330 
previous()1331     void  previous() {
1332         int initialBufIdx = fBufIdx;
1333         if (fBufIdx == fStartBufIdx) {
1334             // At start of cache. Prepend to it.
1335             populatePreceding();
1336         } else {
1337             // Cache already holds the next boundary
1338             fBufIdx = modChunkSize(fBufIdx - 1);
1339             fTextIdx = fBoundaries[fBufIdx];
1340         }
1341         fDone = (fBufIdx == initialBufIdx);
1342         fPosition = fTextIdx;
1343         fRuleStatusIndex = fStatuses[fBufIdx];
1344         return;
1345     };
1346 
1347     // Move the iteration state to the position following the startPosition.
1348     // Input position must be pinned to the input length.
following(int startPos)1349     void following(int startPos) {
1350         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1351             // startPos is in the cache. Do a next() from that position.
1352             // TODO: an awkward set of interactions with bi->fDone
1353             //       seek() does not clear it; it can't because of interactions with populateNear().
1354             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1355             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1356             fDone = false;
1357             next();
1358         }
1359 
1360     };
1361 
preceding(int startPos)1362     void  preceding(int startPos) {
1363         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1364             if (startPos == fTextIdx) {
1365                 previous();
1366             } else {
1367                 // seek() leaves the BreakCache positioned at the preceding boundary
1368                 //        if the requested position is between two boundaries.
1369                 // current() pushes the BreakCache position out to the BreakIterator itself.
1370                 assert(startPos > fTextIdx);
1371                 current();
1372             }
1373         }
1374         return;
1375     };
1376 
1377     /**
1378      * Update the state of the public BreakIterator (fBI) to reflect the
1379      * current state of the break iterator cache (this).
1380      */
current()1381     int current() {
1382         fPosition = fTextIdx;
1383         fRuleStatusIndex = fStatuses[fBufIdx];
1384         fDone = false;
1385         return fTextIdx;
1386     };
1387 
1388     /**
1389      * Add boundaries to the cache near the specified position.
1390      * The given position need not be a boundary itself.
1391      * The input position must be within the range of the text, and
1392      * on a code point boundary.
1393      * If the requested position is a break boundary, leave the iteration
1394      * position on it.
1395      * If the requested position is not a boundary, leave the iteration
1396      * position on the preceding boundary and include both the the
1397      * preceding and following boundaries in the cache.
1398      * Additional boundaries, either preceding or following, may be added
1399      * to the cache as a side effect.
1400      *
1401      * Return false if the operation failed.
1402      */
populateNear(int position)1403     boolean populateNear(int position) {
1404         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1405 
1406         // Add boundaries to the cache near the specified position.
1407         // The given position need not be a boundary itself.
1408         // The input position must be within the range of the text, and
1409         // on a code point boundary.
1410         // If the requested position is a break boundary, leave the iteration
1411         // position on it.
1412         // If the requested position is not a boundary, leave the iteration
1413         // position on the preceding boundary and include both the
1414         // preceding and following boundaries in the cache.
1415         // Additional boundaries, either preceding or following, may be added
1416         // to the cache as a side effect.
1417 
1418         // If the requested position is not near already cached positions, clear the existing cache,
1419         // find a near-by boundary and begin new cache contents there.
1420 
1421         // Threshold for a text position to be considered near to existing cache contents.
1422         // TODO: See issue ICU-22024 "perf tuning of Cache needed."
1423         //       This value is subject to change. See the ticket for more details.
1424         final int CACHE_NEAR = 15;
1425 
1426         int startOfText = fText.getBeginIndex();
1427         int aBoundary = -1;
1428         int ruleStatusIndex = 0;
1429         boolean retainCache = false;
1430         if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
1431             // Requested position is near the existing cache. Retain it.
1432             retainCache = true;
1433         } else if (position <= startOfText + CACHE_NEAR) {
1434             // Requested position is near the start of the text. Fill cache from start, skipping
1435             // the need to find a safe point.
1436             retainCache = false;
1437             aBoundary = startOfText;
1438         } else {
1439             // Requested position is not near the existing cache.
1440             // Find a safe point to refill the cache from.
1441             int backupPos = handleSafePrevious(position);
1442 
1443             if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
1444                 // The requested position is beyond the end of the existing cache, but the
1445                 // reverse rules produced a position near or before the cached region.
1446                 // Retain the existing cache, and fill from the end of it.
1447                 retainCache = true;
1448             } else if (backupPos < startOfText + CACHE_NEAR) {
1449                 // The safe reverse rules moved us to near the start of text.
1450                 // Take that (index 0) as the backup boundary, avoiding the complication
1451                 // (in the following block) of moving forward from the safe point to a known boundary.
1452                 //
1453                 // Retain the cache if it begins not too far from the requested position.
1454                 aBoundary = startOfText;
1455                 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
1456             } else {
1457                 // The safe reverse rules produced a position that is neither near the existing
1458                 // cache, nor near the start of text.
1459                 // Advance to the boundary following.
1460                 // There is a complication: the safe reverse rules identify pairs of code points
1461                 // that are safe. If advancing from the safe point moves forwards by less than
1462                 // two code points, we need to advance one more time to ensure that the boundary
1463                 // is good, including a correct rules status value.
1464                 //
1465                 retainCache = false;
1466                 fPosition = backupPos;
1467                 aBoundary = handleNext();
1468                 if (aBoundary == backupPos + 1 ||
1469                         (aBoundary == backupPos + 2 &&
1470                         Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1471                         Character.isLowSurrogate(fText.next()))) {
1472                     // The initial handleNext() only advanced by a single code point. Go again.
1473                     // Safe rules identify safe pairs.
1474                     aBoundary = handleNext();
1475                 }
1476                 if (aBoundary == BreakIterator.DONE) {
1477                     aBoundary = fText.getEndIndex();
1478                 }
1479                 ruleStatusIndex = fRuleStatusIndex;
1480             }
1481         }
1482 
1483         if (!retainCache) {
1484             assert(aBoundary != -1);
1485             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1486         }
1487 
1488 
1489         // Fill in boundaries between existing cache content and the new requested position.
1490 
1491         if (fBoundaries[fEndBufIdx] < position) {
1492             // The last position in the cache precedes the requested position.
1493             // Add following position(s) to the cache.
1494             while (fBoundaries[fEndBufIdx] < position) {
1495                 if (!populateFollowing()) {
1496                     assert false;
1497                     return false;
1498                 }
1499             }
1500             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1501             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1502             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1503                 previous();
1504             }
1505             return true;
1506         }
1507 
1508         if (fBoundaries[fStartBufIdx] > position) {
1509             // The first position in the cache is beyond the requested position.
1510             // back up more until we get a boundary <= the requested position.
1511             while (fBoundaries[fStartBufIdx] > position) {
1512                 populatePreceding();
1513             }
1514             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1515             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1516             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1517                 next();
1518             }
1519             if (fTextIdx > position) {
1520                 // If position is not itself a boundary, the next() loop above will overshoot.
1521                 // Back up one, leaving cache position at the boundary preceding the requested position.
1522                 previous();
1523             }
1524             return true;
1525         }
1526 
1527         assert fTextIdx == position;
1528         return true;
1529 
1530     };
1531 
1532     /**
1533      *  Add boundary(s) to the cache following the current last boundary.
1534      *  Return false if at the end of the text, and no more boundaries can be added.
1535      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1536      */
populateFollowing()1537     boolean populateFollowing() {
1538         int fromPosition = fBoundaries[fEndBufIdx];
1539         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1540         int pos = 0;
1541         int ruleStatusIdx = 0;
1542 
1543         if (fDictionaryCache.following(fromPosition)) {
1544             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1545             return true;
1546         }
1547 
1548         fPosition = fromPosition;
1549         pos = handleNext();
1550         if (pos == BreakIterator.DONE) {
1551             return false;
1552         }
1553 
1554         ruleStatusIdx = fRuleStatusIndex;
1555         if (fDictionaryCharCount > 0) {
1556             // The text segment obtained from the rules includes dictionary characters.
1557             // Subdivide it, with subdivided results going into the dictionary cache.
1558             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1559             if (fDictionaryCache.following(fromPosition)) {
1560                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1561                 return true;
1562                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1563                 //       But be careful with interactions with populateNear().
1564             }
1565         }
1566 
1567         // Rule based segment did not include dictionary characters.
1568         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1569         //    meaning that we didn't take the return, above.
1570         // Add its end point to the cache.
1571         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1572 
1573         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1574         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1575         //
1576         for (int count=0; count<6; ++count) {
1577             pos = handleNext();
1578             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1579                 break;
1580             }
1581             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1582         }
1583         return true;
1584     };
1585 
1586     /**
1587      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1588      *  Leave the iteration position on the first added boundary.
1589      *  Return false if no boundaries could be added (if at the start of the text.)
1590      */
populatePreceding()1591     boolean populatePreceding() {
1592         int textBegin = fText.getBeginIndex();
1593         int fromPosition = fBoundaries[fStartBufIdx];
1594         if (fromPosition == textBegin) {
1595             return false;
1596         }
1597 
1598         int position = textBegin;
1599         int positionStatusIdx = 0;
1600 
1601         if (fDictionaryCache.preceding(fromPosition)) {
1602             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1603             return true;
1604         }
1605 
1606         int backupPosition = fromPosition;
1607 
1608         // Find a boundary somewhere preceding the first already-cached boundary
1609         do {
1610             backupPosition = backupPosition - 30;
1611             if (backupPosition <= textBegin) {
1612                 backupPosition = textBegin;
1613             } else {
1614                 backupPosition = handleSafePrevious(backupPosition);
1615             }
1616             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1617                 position = textBegin;
1618                 positionStatusIdx = 0;
1619             } else {
1620                 // Advance to the boundary following the backup position.
1621                 // There is a complication: the safe reverse rules identify pairs of code points
1622                 // that are safe. If advancing from the safe point moves forwards by less than
1623                 // two code points, we need to advance one more time to ensure that the boundary
1624                 // is good, including a correct rules status value.
1625                 //
1626                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1627                 position = handleNext();
1628                 if (position == backupPosition + 1 ||
1629                         (position == backupPosition + 2 &&
1630                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1631                         Character.isLowSurrogate(fText.next()))) {
1632                     // The initial handleNext() only advanced by a single code point. Go again.
1633                     // Safe rules identify safe pairs.
1634                     position = handleNext();
1635                 }
1636                 positionStatusIdx = fRuleStatusIndex;
1637             }
1638         } while (position >= fromPosition);
1639 
1640         // Find boundaries between the one we just located and the first already-cached boundary
1641         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1642 
1643         fSideBuffer.removeAllElements();
1644         fSideBuffer.push(position);
1645         fSideBuffer.push(positionStatusIdx);
1646 
1647         do {
1648             int prevPosition = fPosition = position;
1649             int prevStatusIdx = positionStatusIdx;
1650             position = handleNext();
1651             positionStatusIdx = fRuleStatusIndex;
1652             if (position == BreakIterator.DONE) {
1653                 break;
1654             }
1655 
1656             boolean segmentHandledByDictionary = false;
1657             if (fDictionaryCharCount != 0) {
1658                 // Segment from the rules includes dictionary characters.
1659                 // Subdivide it, with subdivided results going into the dictionary cache.
1660                 int dictSegEndPosition = position;
1661                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1662                 while (fDictionaryCache.following(prevPosition)) {
1663                     position = fDictionaryCache.fBoundary;
1664                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1665                     segmentHandledByDictionary = true;
1666                     assert(position > prevPosition);
1667                     if (position >= fromPosition) {
1668                         break;
1669                     }
1670                     assert(position <= dictSegEndPosition);
1671                     fSideBuffer.push(position);
1672                     fSideBuffer.push(positionStatusIdx);
1673                     prevPosition = position;
1674                 }
1675                 assert(position==dictSegEndPosition || position>=fromPosition);
1676             }
1677 
1678             if (!segmentHandledByDictionary && position < fromPosition) {
1679                 fSideBuffer.push(position);
1680                 fSideBuffer.push(positionStatusIdx);
1681             }
1682         } while (position < fromPosition);
1683 
1684         // Move boundaries from the side buffer to the main circular buffer.
1685         boolean success = false;
1686         if (!fSideBuffer.isEmpty()) {
1687             positionStatusIdx = fSideBuffer.pop();
1688             position = fSideBuffer.pop();
1689             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1690             success = true;
1691         }
1692 
1693         while (!fSideBuffer.isEmpty()) {
1694             positionStatusIdx = fSideBuffer.pop();
1695             position = fSideBuffer.pop();
1696             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1697                 // No space in circular buffer to hold a new preceding result while
1698                 // also retaining the current cache (iteration) position.
1699                 // Bailing out is safe; the cache will refill again if needed.
1700                 break;
1701             }
1702         }
1703         return success;
1704     };
1705 
1706 
1707     static final boolean RetainCachePosition = false;
1708     static final boolean UpdateCachePosition = true;
1709 
1710     /**
1711      * Add the boundary following the current position.
1712      * The current position can be left as it was, or changed to the newly added boundary,
1713      * as specified by the update parameter.
1714      */
addFollowing(int position, int ruleStatusIdx, boolean update)1715     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1716         assert(position > fBoundaries[fEndBufIdx]);
1717         assert(ruleStatusIdx <= Short.MAX_VALUE);
1718         int nextIdx = modChunkSize(fEndBufIdx + 1);
1719         if (nextIdx == fStartBufIdx) {
1720             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1721         }
1722         fBoundaries[nextIdx] = position;
1723         fStatuses[nextIdx] = (short)ruleStatusIdx;
1724         fEndBufIdx = nextIdx;
1725         if (update == UpdateCachePosition) {
1726             // Set current position to the newly added boundary.
1727             fBufIdx = nextIdx;
1728             fTextIdx = position;
1729         } else {
1730             // Retaining the original cache position.
1731             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1732             // It's the responsibility of callers of this function to not add too many.
1733             assert(nextIdx != fBufIdx);
1734         }
1735 
1736     };
1737 
1738 
1739     /**
1740      * Add the boundary preceding the current position.
1741      * The current position can be left as it was, or changed to the newly added boundary,
1742      * as specified by the update parameter.
1743      */
addPreceding(int position, int ruleStatusIdx, boolean update)1744     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1745         assert(position < fBoundaries[fStartBufIdx]);
1746         assert(ruleStatusIdx <= Short.MAX_VALUE);
1747         int nextIdx = modChunkSize(fStartBufIdx - 1);
1748         if (nextIdx == fEndBufIdx) {
1749             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1750                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1751                 // current iteration position. And we also want to retain the current iteration position.
1752                 // (The buffer is already completely full of entries that precede the iteration position.)
1753                 return false;
1754             }
1755             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1756         }
1757         fBoundaries[nextIdx] = position;
1758         fStatuses[nextIdx] = (short)ruleStatusIdx;
1759         fStartBufIdx = nextIdx;
1760         if (update == UpdateCachePosition) {
1761             fBufIdx = nextIdx;
1762             fTextIdx = position;
1763         }
1764         return true;
1765     };
1766 
1767     /**
1768      *  Set the cache position to the specified position, or, if the position
1769      *  falls between to cached boundaries, to the preceding boundary.
1770      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1771      *  The startPosition must be on a code point boundary.
1772      *
1773      *  Return true if successful, false if the specified position is after
1774      *  the last cached boundary or before the first.
1775      */
1776     boolean seek(int pos) {
1777         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1778             return false;
1779         }
1780         if (pos == fBoundaries[fStartBufIdx]) {
1781             // Common case: seek(0), from BreakIterator::first()
1782             fBufIdx = fStartBufIdx;
1783             fTextIdx = fBoundaries[fBufIdx];
1784             return true;
1785         }
1786         if (pos == fBoundaries[fEndBufIdx]) {
1787             fBufIdx = fEndBufIdx;
1788             fTextIdx = fBoundaries[fBufIdx];
1789             return true;
1790         }
1791 
1792         int min = fStartBufIdx;
1793         int max = fEndBufIdx;
1794         while (min != max) {
1795             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1796             probe = modChunkSize(probe);
1797             if (fBoundaries[probe] > pos) {
1798                 max = probe;
1799             } else {
1800                 min = modChunkSize(probe + 1);
1801             }
1802         }
1803         assert(fBoundaries[max] > pos);
1804         fBufIdx = modChunkSize(max - 1);
1805         fTextIdx = fBoundaries[fBufIdx];
1806         assert(fTextIdx <= pos);
1807         return true;
1808 
1809     };
1810 
1811 
1812     /**
1813      * copy constructor, used from RuleBasedBreakIterator.clone().
1814      *
1815      * @param src
1816      */
BreakCache(BreakCache src)1817     BreakCache(BreakCache src)  {
1818         fStartBufIdx = src.fStartBufIdx;
1819         fEndBufIdx = src.fEndBufIdx;
1820         fTextIdx = src.fTextIdx;
1821         fBufIdx = src.fBufIdx;
1822         fBoundaries = src.fBoundaries.clone();
1823         fStatuses = src.fStatuses.clone();
1824         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1825     }
1826 
dumpCache()1827     void dumpCache() {
1828         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1829         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1830             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1831             if (i == fEndBufIdx) {
1832                 break;
1833             }
1834         }
1835     };
1836 
modChunkSize(int index)1837     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1838 
1839     static final int CACHE_SIZE = 128;
1840     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1841 
1842     int                 fStartBufIdx;
1843     int                 fEndBufIdx;    // inclusive
1844 
1845     int                 fTextIdx;
1846     int                 fBufIdx;
1847 
1848     int[]               fBoundaries = new int[CACHE_SIZE];
1849     short[]             fStatuses = new short[CACHE_SIZE];
1850 
1851     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1852 };
1853 
1854 
1855 
1856 
1857 }
1858 
1859