• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 2005-2016 International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package android.icu.text;
12 
13 import static android.icu.impl.CharacterIteration.DONE32;
14 import static android.icu.impl.CharacterIteration.next32;
15 import static android.icu.impl.CharacterIteration.nextTrail32;
16 
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.OutputStream;
21 import java.nio.ByteBuffer;
22 import java.text.CharacterIterator;
23 import java.util.ArrayList;
24 import java.util.List;
25 import java.util.MissingResourceException;
26 
27 import android.icu.impl.CharacterIteration;
28 import android.icu.impl.ICUBinary;
29 import android.icu.impl.ICUDebug;
30 import android.icu.impl.RBBIDataWrapper;
31 import android.icu.impl.breakiter.BurmeseBreakEngine;
32 import android.icu.impl.breakiter.CjkBreakEngine;
33 import android.icu.impl.breakiter.DictionaryBreakEngine;
34 import android.icu.impl.breakiter.KhmerBreakEngine;
35 import android.icu.impl.breakiter.LSTMBreakEngine;
36 import android.icu.impl.breakiter.LanguageBreakEngine;
37 import android.icu.impl.breakiter.LaoBreakEngine;
38 import android.icu.impl.breakiter.ThaiBreakEngine;
39 import android.icu.impl.breakiter.UnhandledBreakEngine;
40 import android.icu.lang.UCharacter;
41 import android.icu.lang.UProperty;
42 import android.icu.lang.UScript;
43 import android.icu.util.CodePointTrie;
44 
45 /**
46  * Rule Based Break Iterator
47  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
48  *
49  * @hide Only a subset of ICU is exposed in Android
50  */
51 public class RuleBasedBreakIterator extends BreakIterator {
52     //=======================================================================
53     // Constructors & Factories
54     //=======================================================================
55 
56     /**
57      * private constructor
58      */
RuleBasedBreakIterator()59     private RuleBasedBreakIterator() {
60         fDictionaryCharCount  = 0;
61         synchronized(gAllBreakEngines) {
62             fBreakEngines = new ArrayList<>(gAllBreakEngines);
63         }
64     }
65 
66     /**
67      * Create a break iterator from a precompiled set of break rules.
68      *
69      * Creating a break iterator from the binary rules is much faster than
70      * creating one from source rules.
71      *
72      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
73      * Binary break iterator rules are not guaranteed to be compatible between
74      * different versions of ICU.
75      *
76      * @param is an input stream supplying the compiled binary rules.
77      * @throws IOException if there is an error while reading the rules from the InputStream.
78      * @see    #compileRules(String, OutputStream)
79      */
getInstanceFromCompiledRules(InputStream is)80     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
81         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
82         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
83         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
84         return This;
85     }
86 
87     /**
88      * This factory method doesn't have an access modifier; it is only accessible in the same
89      * package.
90      *
91      * Create a break iterator from a precompiled set of break rules.
92      *
93      * Creating a break iterator from the binary rules is much faster than
94      * creating one from source rules.
95      *
96      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
97      * Binary break iterator rules are not guaranteed to be compatible between
98      * different versions of ICU.
99      *
100      * @param bytes a buffer supplying the compiled binary rules.
101      * @param phraseBreaking a flag indicating if phrase breaking is required.
102      * @throws IOException if there is an error while reading the rules from the buffer.
103      * @see    #compileRules(String, OutputStream)
104      * @hide draft / provisional / internal are hidden on Android
105      */
getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)106     /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
107             ByteBuffer bytes, boolean phraseBreaking) throws IOException {
108         RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
109         instance.fPhraseBreaking = phraseBreaking;
110         return instance;
111     }
112 
113     /**
114      * Create a break iterator from a precompiled set of break rules.
115      *
116      * Creating a break iterator from the binary rules is much faster than
117      * creating one from source rules.
118      *
119      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
120      * Binary break iterator rules are not guaranteed to be compatible between
121      * different versions of ICU.
122      *
123      * @param bytes a buffer supplying the compiled binary rules.
124      * @throws IOException if there is an error while reading the rules from the buffer.
125      * @see    #compileRules(String, OutputStream)
126      * @deprecated This API is ICU internal only.
127      * @hide draft / provisional / internal are hidden on Android
128      */
129     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)130     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
131         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
132         This.fRData = RBBIDataWrapper.get(bytes);
133         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
134         return This;
135     }
136 
137     /**
138      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
139      * @param rules The break rules to be used.
140      */
RuleBasedBreakIterator(String rules)141     public RuleBasedBreakIterator(String rules)  {
142         this();
143         try {
144             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
145             compileRules(rules, ruleOS);
146             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
147             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
148         } catch (IOException e) {
149             ///CLOVER:OFF
150             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
151             //  causing bogus compiled rules to be produced, but with no compile error raised.
152             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
153                     + e.getMessage());
154             throw rte;
155             ///CLOVER:ON
156         }
157     }
158 
159     //=======================================================================
160     // Boilerplate
161     //=======================================================================
162 
163     /**
164      * Clones this iterator.
165      * @return A newly-constructed RuleBasedBreakIterator with the same
166      * behavior as this one.
167      */
168     @Override
clone()169     public Object clone()  {
170         RuleBasedBreakIterator result;
171         result = (RuleBasedBreakIterator)super.clone();
172         if (fText != null) {
173             result.fText = (CharacterIterator)(fText.clone());
174         }
175         synchronized (gAllBreakEngines)  {
176             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
177         }
178         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
179         result.fBreakCache = result.new BreakCache(fBreakCache);
180         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
181         return result;
182     }
183 
184 
185     /**
186      * Returns true if both BreakIterators are of the same class, have the same
187      * rules, and iterate over the same text.
188      */
189     @Override
equals(Object that)190     public boolean equals(Object that) {
191         if (that == null) {
192             return false;
193         }
194         if (this == that) {
195             return true;
196         }
197         try {
198             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
199             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
200                 return false;
201             }
202             if (fRData != null && other.fRData != null &&
203                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
204                 return false;
205             }
206             if (fText == null && other.fText == null) {
207                 return true;
208             }
209             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
210                 return false;
211             }
212             return fPosition == other.fPosition;
213         }
214         catch(ClassCastException e) {
215             return false;
216         }
217      }
218 
219     /**
220      * Returns the description (rules) used to create this iterator.
221      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
222      */
223     @Override
toString()224     public String toString() {
225         String retStr = "";
226         if (fRData != null) {
227             retStr =  fRData.fRuleSource;
228         }
229         return retStr;
230     }
231 
232     /**
233      * Compute a hashcode for this BreakIterator
234      * @return A hash code
235      */
236     @Override
hashCode()237     public int hashCode()
238     {
239         return fRData.fRuleSource.hashCode();
240     }
241 
242 
243     private static final int  START_STATE = 1;     // The state number of the starting state
244     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
245 
246     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
247     //               of user text.  A variable with this enum type keeps track of where we
248     //               are.  The state machine only fetches user text input while in RUN mode.
249     private static final int  RBBI_START  = 0;
250     private static final int  RBBI_RUN    = 1;
251     private static final int  RBBI_END    = 2;
252 
253     /**
254      * The character iterator through which this BreakIterator accesses the text.
255      */
256     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
257 
258     /**
259      * The rule data for this BreakIterator instance.
260      * Not intended for public use. Declared public for testing purposes only.
261      * @deprecated This API is ICU internal only.
262      * @hide draft / provisional / internal are hidden on Android
263      */
264     @Deprecated
265     public RBBIDataWrapper    fRData;
266 
267     /**
268      *  The iteration state - current position, rule status for the current position,
269      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
270      *                        Current position is pinned to be 0 < position <= text.length.
271      *                        Current position is always set to a boundary.
272      *
273      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
274      *  Never has the value UBRK_DONE (-1).
275      */
276     private int                fPosition;
277 
278     /**
279      * Index of the Rule {tag} values for the most recent match.
280      */
281     private int                fRuleStatusIndex;
282 
283     /**
284      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
285      */
286     private boolean            fDone;
287 
288     /**
289      *  Array of look-ahead tentative results.
290      */
291     private int[]              fLookAheadMatches;
292 
293     /**
294      *   Cache of previously determined boundary positions.
295      */
296     private BreakCache         fBreakCache = new BreakCache();
297 
298     /**
299      * Flag used to indicate if phrase breaking is required.
300      */
301     private boolean            fPhraseBreaking = false;
302 
303 
304     /**
305      * Counter for the number of characters encountered with the "dictionary"
306      *   flag set.  Normal RBBI iterators don't use it, although the code
307      *   for updating it is live.  Dictionary Based break iterators (a subclass
308      *   of us) access this field directly.
309      * @hide draft / provisional / internal are hidden on Android
310      */
311     private int fDictionaryCharCount;
312 
313     private DictionaryCache     fDictionaryCache = new DictionaryCache();
314 
315     /**
316      * ICU debug argument name for RBBI
317      */
318     private static final String RBBI_DEBUG_ARG = "rbbi";
319 
320     /**
321      * Debugging flag.  Trace operation of state machine when true.
322      */
323     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
324             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
325 
326     /**
327      * The "default" break engine - just skips over ranges of dictionary words,
328      * producing no breaks. Should only be used if characters need to be handled
329      * by a dictionary but we have no dictionary implementation for them.
330      *
331      * Only one instance; shared by all break iterators.
332      */
333     private static final UnhandledBreakEngine gUnhandledBreakEngine;
334 
335     /**
336      * List of all known break engines, common for all break iterators.
337      * Lazily updated as break engines are needed, because instantiation of
338      * break engines is expensive.
339      *
340      * Because gAllBreakEngines can be referenced concurrently from different
341      * BreakIterator instances, all access is synchronized.
342      */
343     private static final List<LanguageBreakEngine> gAllBreakEngines;
344 
345     static {
346         gUnhandledBreakEngine = new UnhandledBreakEngine();
347         gAllBreakEngines = new ArrayList<>();
348         gAllBreakEngines.add(gUnhandledBreakEngine);
349     }
350 
351     /**
352      * List of all known break engines. Similar to gAllBreakEngines, but local to a
353      * break iterator, allowing it to be used without synchronization.
354      */
355     private List<LanguageBreakEngine> fBreakEngines;
356 
357     /**
358      * Dump the contents of the state table and character classes for this break iterator.
359      * For debugging only.
360      * @deprecated This API is ICU internal only.
361      * @hide draft / provisional / internal are hidden on Android
362      */
363     @Deprecated
dump(java.io.PrintStream out)364     public void dump(java.io.PrintStream out) {
365         if (out == null) {
366             out = System.out;
367         }
368         this.fRData.dump(out);
369     }
370 
371     /**
372      * Compile a set of source break rules into the binary state tables used
373      * by the break iterator engine.  Creating a break iterator from precompiled
374      * rules is much faster than creating one from source rules.
375      *
376      * Binary break rules are not guaranteed to be compatible between different
377      * versions of ICU.
378      *
379      *
380      * @param rules  The source form of the break rules
381      * @param ruleBinary  An output stream to receive the compiled rules.
382      * @throws IOException If there is an error writing the output.
383      * @see #getInstanceFromCompiledRules(InputStream)
384      */
compileRules(String rules, OutputStream ruleBinary)385     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
386         RBBIRuleBuilder.compileRules(rules, ruleBinary);
387     }
388 
389     //=======================================================================
390     // BreakIterator overrides
391     //=======================================================================
392 
393     /**
394      * Sets the current iteration position to the beginning of the text.
395      * (i.e., the CharacterIterator's starting offset).
396      * @return The offset of the beginning of the text.
397      */
398     @Override
first()399     public int first() {
400         if (fText == null) {
401             return BreakIterator.DONE;
402         }
403         fText.first();
404         int start =  fText.getIndex();
405         if (!fBreakCache.seek(start)) {
406             fBreakCache.populateNear(start);
407         }
408         fBreakCache.current();
409         assert(fPosition == start);
410         return fPosition;
411     }
412 
413     /**
414      * Sets the current iteration position to the end of the text.
415      * (i.e., the CharacterIterator's ending offset).
416      * @return The text's past-the-end offset.
417      */
418     @Override
last()419     public int last() {
420         if (fText == null) {
421             return BreakIterator.DONE;
422         }
423         int endPos = fText.getEndIndex();
424         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
425         assert(endShouldBeBoundary);
426         if (fPosition != endPos) {
427             assert(fPosition == endPos);
428         }
429         return endPos;
430     }
431 
432     /**
433      * Advances the iterator either forward or backward the specified number of steps.
434      * Negative values move backward, and positive values move forward.  This is
435      * equivalent to repeatedly calling next() or previous().
436      * @param n The number of steps to move.  The sign indicates the direction
437      * (negative is backwards, and positive is forwards).
438      * @return The character offset of the boundary position n boundaries away from
439      * the current one.
440      */
441     @Override
next(int n)442     public int next(int n) {
443         int result = 0;
444         if (n > 0) {
445             for (; n > 0 && result != DONE; --n) {
446                 result = next();
447             }
448         } else if (n < 0) {
449             for (; n < 0 && result != DONE; ++n) {
450                 result = previous();
451             }
452         } else {
453             result = current();
454         }
455         return result;
456     }
457 
458     /**
459      * Advances the iterator to the next boundary position.
460      * @return The position of the first boundary after this one.
461      */
462     @Override
next()463     public int next() {
464         fBreakCache.next();
465         return fDone ? DONE : fPosition;
466     }
467 
468     /**
469      * Moves the iterator backwards, to the boundary preceding the current one.
470      * @return The position of the boundary position immediately preceding the starting position.
471      */
472     @Override
previous()473     public int previous() {
474         fBreakCache.previous();
475         return fDone ? DONE : fPosition;
476     }
477 
478     /**
479      * Sets the iterator to refer to the first boundary position following
480      * the specified position.
481      * @param startPos The position from which to begin searching for a break position.
482      * @return The position of the first break after the current position.
483      */
484     @Override
following(int startPos)485     public int following(int startPos) {
486         // if the supplied position is before the beginning, return the
487         // text's starting offset
488         if (startPos < fText.getBeginIndex()) {
489             return first();
490         }
491 
492         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
493         // Or it may be beyond the end of the text.
494         startPos = CISetIndex32(fText, startPos);
495         fBreakCache.following(startPos);
496         return fDone ? DONE : fPosition;
497     }
498 
499 
500     /**
501      * Sets the iterator to refer to the last boundary position before the
502      * specified position.
503      * @param offset The position to begin searching for a break from.
504      * @return The position of the last boundary before the starting position.
505      */
506     @Override
preceding(int offset)507     public int preceding(int offset) {
508         if (fText == null || offset > fText.getEndIndex()) {
509             return last();
510         } else if (offset < fText.getBeginIndex()) {
511             return first();
512         }
513 
514         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
515         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
516         int adjustedOffset = offset;
517         fBreakCache.preceding(adjustedOffset);
518         return fDone ? DONE : fPosition;
519 
520     }
521 
522 
523     /**
524      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
525      */
checkOffset(int offset, CharacterIterator text)526     protected static final void checkOffset(int offset, CharacterIterator text) {
527         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
528             throw new IllegalArgumentException("offset out of bounds");
529         }
530     }
531 
532 
533     /**
534      * Returns true if the specified position is a boundary position.  As a side
535      * effect, leaves the iterator pointing to the first boundary position at
536      * or after "offset".
537      * @param offset the offset to check.
538      * @return True if "offset" is a boundary position.
539      */
540     @Override
isBoundary(int offset)541     public boolean isBoundary(int offset) {
542         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
543         //       to not be boundaries, and to not be errors.
544         checkOffset(offset, fText);
545 
546         // Adjust offset to be on a code point boundary and not beyond the end of the text.
547         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
548         // But we still need the side effect of leaving iteration at the following boundary.
549         int adjustedOffset = CISetIndex32(fText, offset);
550 
551         boolean result = false;
552         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
553             result = (fBreakCache.current() == offset);
554         }
555 
556         if (!result) {
557             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
558             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
559             next();
560         }
561         return result;
562 
563     }
564 
565     /**
566      * Returns the current iteration position.  Note that DONE is never
567      * returned from this function; if iteration has run to the end of a
568      * string, current() will return the length of the string while
569      * next() will return BreakIterator.DONE).
570      * @return The current iteration position.
571      */
572     @Override
current()573     public int current() {
574         return (fText != null) ? fPosition : BreakIterator.DONE;
575     }
576 
577 
578     /**
579      * Return the status tag from the break rule that determined the boundary at
580      * the current iteration position.  The values appear in the rule source
581      * within brackets, {123}, for example.  For rules that do not specify a
582      * status, a default value of 0 is returned.  If more than one rule applies,
583      * the numerically largest of the possible status values is returned.
584      * <p>
585      * Of the standard types of ICU break iterators, only the word and line break
586      * iterator provides status values.  The values are defined in
587      * class RuleBasedBreakIterator, and allow distinguishing between words
588      * that contain alphabetic letters, "words" that appear to be numbers,
589      * punctuation and spaces, words containing ideographic characters, and
590      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
591      * position from <code>next()</code>, <code>previous()</code>, or
592      * any other break iterator functions that returns a boundary position.
593      * <p>
594      * Note that <code>getRuleStatus()</code> returns the value corresponding to
595      * <code>current()</code> index even after <code>next()</code> has returned DONE.
596      * <p>
597 
598      * @return the status from the break rule that determined the boundary
599      * at the current iteration position.
600      */
601 
602     @Override
getRuleStatus()603     public int  getRuleStatus() {
604         //   Status records have this form:
605         //           Count N         <--  fLastRuleStatusIndex points here.
606         //           Status val 0
607         //           Status val 1
608         //              ...
609         //           Status val N-1  <--  the value we need to return
610         //   The status values are sorted in ascending order.
611         //   This function returns the last (largest) of the array of status values.
612         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
613         int  tagVal = fRData.fStatusTable[idx];
614         return tagVal;
615     }
616 
617     /**
618      * Get the status (tag) values from the break rule(s) that determined the boundary
619      * at the current iteration position.  The values appear in the rule source
620      * within brackets, {123}, for example.  The default status value for rules
621      * that do not explicitly provide one is zero.
622      * <p>
623      * The status values used by the standard ICU break rules are defined
624      * as public constants in class RuleBasedBreakIterator.
625      * <p>
626      * If the size  of the output array is insufficient to hold the data,
627      *  the output will be truncated to the available length.  No exception
628      *  will be thrown.
629      *
630      * @param fillInArray an array to be filled in with the status values.
631      * @return          The number of rule status values from the rules that determined
632      *                  the boundary at the current iteration position.
633      *                  In the event that the array is too small, the return value
634      *                  is the total number of status values that were available,
635      *                  not the reduced number that were actually returned.
636      */
637     @Override
getRuleStatusVec(int[] fillInArray)638     public int getRuleStatusVec(int[] fillInArray) {
639         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
640         if (fillInArray != null) {
641             int numToCopy = Math.min(numStatusVals, fillInArray.length);
642             for (int i=0; i<numToCopy; i++) {
643                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
644             }
645         }
646         return numStatusVals;
647     }
648 
649     /**
650      * Returns a CharacterIterator over the text being analyzed.
651      * <p>
652      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
653      * must not be modified in any way while the BreakIterator is still in use.
654      * Doing so will lead to undefined behavior of the BreakIterator.
655      * Clone the returned CharacterIterator first and work with that.
656      * <p>
657      * The returned CharacterIterator is a reference
658      * to the <b>actual iterator being used</b> by the BreakIterator.
659      * No guarantees are made about the current position
660      * of this iterator when it is returned; it may differ from the
661      * BreakIterators current position.  If you need to move that
662      * position to examine the text, clone this function's return value first.
663      * @return An iterator over the text being analyzed.
664      */
665     @Override
getText()666     public CharacterIterator getText() {
667         return fText;
668     }
669 
670     /**
671      * Set the iterator to analyze a new piece of text.  This function resets
672      * the current iteration position to the beginning of the text.
673      * (The old iterator is dropped.)
674      * <p>
675      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
676      * directly by the BreakIterator, and must not be altered in any
677      * way by code outside of the BreakIterator.
678      * Doing so will lead to undefined behavior of the BreakIterator.
679      *
680      * @param newText An iterator over the text to analyze.
681      */
682     @Override
setText(CharacterIterator newText)683     public void setText(CharacterIterator newText) {
684         if (newText != null) {
685             fBreakCache.reset(newText.getBeginIndex(), 0);
686         } else {
687             fBreakCache.reset();
688         }
689         fDictionaryCache.reset();
690         fText = newText;
691         this.first();
692     }
693 
694      /**
695      * Control debug, trace and dump options.
696      * @deprecated This API is ICU internal only.
697      * @hide draft / provisional / internal are hidden on Android
698      */
699     @Deprecated
700     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
701                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
702 
703 
getLanguageBreakEngine(int c)704     private LanguageBreakEngine getLanguageBreakEngine(int c) {
705 
706         // We have a dictionary character.
707         // Does an already instantiated break engine handle it?
708         for (LanguageBreakEngine candidate : fBreakEngines) {
709             if (candidate.handles(c)) {
710                 return candidate;
711             }
712         }
713 
714         synchronized (gAllBreakEngines) {
715             // This break iterator's list of break engines didn't handle the character.
716             // Check the global list, another break iterator may have instantiated the
717             // desired engine.
718             for (LanguageBreakEngine candidate : gAllBreakEngines) {
719                 if (candidate.handles(c)) {
720                     fBreakEngines.add(candidate);
721                     return candidate;
722                 }
723             }
724 
725             // The global list doesn't have an existing engine, build one.
726             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
727             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
728                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
729                 // Fold them together for mapping from script -> engine.
730                 script = UScript.HAN;
731             }
732 
733             LanguageBreakEngine eng;
734             try {
735                 switch (script) {
736                 case UScript.THAI:
737                     try {
738                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
739                     } catch (MissingResourceException e) {
740                         eng = new ThaiBreakEngine();
741                     }
742                     break;
743                 case UScript.LAO:
744                     eng = new LaoBreakEngine();
745                     break;
746                 case UScript.MYANMAR:
747                     try {
748                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
749                     } catch (MissingResourceException e) {
750                         eng = new BurmeseBreakEngine();
751                     }
752                     break;
753                 case UScript.KHMER:
754                     eng = new KhmerBreakEngine();
755                     break;
756                 case UScript.HAN:
757                     eng = new CjkBreakEngine(false);
758                      break;
759                 case UScript.HANGUL:
760                     eng = new CjkBreakEngine(true);
761                     break;
762                 default:
763                     gUnhandledBreakEngine.handleChar(c);
764                     eng = gUnhandledBreakEngine;
765                     break;
766                 }
767             } catch (IOException e) {
768                 eng = null;
769             }
770 
771             if (eng != null && eng != gUnhandledBreakEngine) {
772                 gAllBreakEngines.add(eng);
773                 fBreakEngines.add(eng);
774             }
775             return eng;
776         }   // end synchronized(gAllBreakEngines)
777     }
778 
779     /**
780      * The State Machine Engine for moving forward is here.
781      * This function is the heart of the RBBI run time engine.
782      *
783      * Input
784      *    fPosition, the position in the text to begin from.
785      * Output
786      *    fPosition:           the boundary following the starting position.
787      *    fDictionaryCharCount the number of dictionary characters encountered.
788      *                         If > 0, the segment will be further subdivided
789      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
790      *
791      * @return the new iterator position
792      *
793      * A note on supplementary characters and the position of underlying
794      * Java CharacterIterator:   Normally, a character iterator is positioned at
795      * the char most recently returned by next().  Within this function, when
796      * a supplementary char is being processed, the char iterator is left
797      * sitting on the trail surrogate, in the middle of the code point.
798      * This is different from everywhere else, where an iterator always
799      * points at the lead surrogate of a supplementary.
800      */
handleNext()801     private int handleNext() {
802         if (TRACE) {
803             System.out.println("Handle Next   pos      char  state category");
804         }
805 
806         // handleNext always sets the break tag value.
807         // Set the default for it.
808         fRuleStatusIndex  = 0;
809         fDictionaryCharCount = 0;
810 
811         // caches for quicker access
812         CharacterIterator text = fText;
813         CodePointTrie trie = fRData.fTrie;
814 
815         char[] stateTable  = fRData.fFTable.fTable;
816         int initialPosition = fPosition;
817         text.setIndex(initialPosition);
818         int result          = initialPosition;
819 
820         // Set up the starting char
821         int c = text.current();
822         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
823             c = nextTrail32(text, c);
824             if (c == DONE32) {
825                 fDone = true;
826                 return BreakIterator.DONE;
827             }
828         }
829 
830         // Set the initial state for the state machine
831         int state           = START_STATE;
832         int row             = fRData.getRowIndex(state);
833         short category      = 3;
834         int flagsState      = fRData.fFTable.fFlags;
835         int dictStart       = fRData.fFTable.fDictCategoriesStart;
836         int mode            = RBBI_RUN;
837         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
838             category = 2;
839             mode     = RBBI_START;
840             if (TRACE) {
841                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
842                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
843                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
844             }
845         }
846 
847         // loop until we reach the end of the text or transition to state 0
848         while (state != STOP_STATE) {
849             if (c == DONE32) {
850                 // Reached end of input string.
851                 if (mode == RBBI_END) {
852                     // We have already run the loop one last time with the
853                     // character set to the pseudo {eof} value. Now it is time
854                     // to unconditionally bail out.
855                     break;
856                 }
857                 // Run the loop one last time with the fake end-of-input character category
858                 mode = RBBI_END;
859                 category = 1;
860             }
861             else if (mode == RBBI_RUN) {
862                 // Get the char category.  An incoming category of 1 or 2 mens that
863                 //      we are preset for doing the beginning or end of input, and
864                 //      that we shouldn't get a category from an actual text input character.
865                 //
866 
867                 // look up the current character's character category, which tells us
868                 // which column in the state table to look at.
869                 //
870                 category = (short) trie.get(c);
871 
872                 // Check for categories that require word dictionary handling.
873                 if (category >= dictStart) {
874                     fDictionaryCharCount++;
875                 }
876 
877                 if (TRACE) {
878                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
879                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
880                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
881                 }
882 
883                 // Advance to the next character.
884                 // If this is a beginning-of-input loop iteration, don't advance.
885                 //    The next iteration will be processing the first real input character.
886                 c = text.next();
887                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
888                     c = nextTrail32(text, c);
889                 }
890             }
891             else {
892                 mode = RBBI_RUN;
893             }
894 
895             // look up a state transition in the state table
896             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
897             row   = fRData.getRowIndex(state);
898             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
899             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
900                 // Match found, common case
901                 result = text.getIndex();
902                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
903                     // The iterator has been left in the middle of a surrogate pair.
904                     // We want the start of it.
905                     result--;
906                 }
907 
908                 //  Remember the break status (tag) values.
909                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
910             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
911                 // Lookahead match is completed
912                 int lookaheadResult = fLookAheadMatches[accepting];
913                 if (lookaheadResult >= 0) {
914                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
915                     fPosition = lookaheadResult;
916                     return lookaheadResult;
917                 }
918             }
919 
920 
921             // If we are at the position of the '/' in a look-ahead (hard break) rule;
922             // record the current position, to be returned later, if the full rule matches.
923             // TODO: Move this check before the previous check of fAccepting.
924             //       This would enable hard-break rules with no following context.
925             //       But there are line break test failures when trying this. Investigate.
926             //       Issue ICU-20837
927             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
928             if (rule != 0) {
929                 int  pos = text.getIndex();
930                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
931                     // The iterator has been left in the middle of a surrogate pair.
932                     // We want the beginning  of it.
933                     pos--;
934                 }
935                 fLookAheadMatches[rule] = pos;
936             }
937 
938 
939         }        // End of state machine main loop
940 
941         // The state machine is done.  Check whether it found a match...
942 
943         // If the iterator failed to advance in the match engine force it ahead by one.
944         // This indicates a defect in the break rules, which should always match
945         // at least one character.
946 
947         if (result == initialPosition) {
948             if (TRACE) {
949                 System.out.println("Iterator did not move. Advancing by 1.");
950             }
951             text.setIndex(initialPosition);
952             next32(text);
953             result = text.getIndex();
954             fRuleStatusIndex = 0;
955         }
956 
957         // Leave the iterator at our result position.
958         //   (we may have advanced beyond the last accepting position chasing after
959         //    longer matches that never completed.)
960         fPosition = result;
961 
962         if (TRACE) {
963             System.out.println("result = " + result);
964         }
965         return result;
966     }
967 
968     /**
969      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
970      * This locates a "Safe Position" from which the forward break rules
971      * will operate correctly. A Safe Position is not necessarily a boundary itself.
972      *
973      * The logic of this function is very similar to handleNext(), above, but simpler
974      * because the safe table does not require as many options.
975      *
976      * @param fromPosition the position in the input text to begin the iteration.
977      * @hide draft / provisional / internal are hidden on Android
978      */
handleSafePrevious(int fromPosition)979     private int handleSafePrevious(int fromPosition) {
980         char            state;
981         short           category = 0;
982         int             result = 0;
983 
984         // caches for quicker access
985         CharacterIterator text = fText;
986         CodePointTrie trie = fRData.fTrie;
987         char[] stateTable  = fRData.fRTable.fTable;
988 
989         CISetIndex32(text, fromPosition);
990         if (TRACE) {
991             System.out.print("Handle Previous   pos   char  state category");
992         }
993 
994         // if we're already at the start of the text, return DONE.
995         if (text.getIndex() == text.getBeginIndex()) {
996             return BreakIterator.DONE;
997         }
998 
999         //  Set the initial state for the state machine
1000         int c = CharacterIteration.previous32(text);
1001         state = START_STATE;
1002         int row = fRData.getRowIndex(state);
1003 
1004         // loop until we reach the start of the text or transition to state 0
1005         //
1006         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
1007 
1008             // look up the current character's character category, which tells us
1009             // which column in the state table to look at.
1010             //
1011             //  And off the dictionary flag bit. For reverse iteration it is not used.
1012             category = (short) trie.get(c);
1013             if (TRACE) {
1014                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1015                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1016                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1017             }
1018 
1019             // State Transition - move machine to its next state
1020             //
1021             assert(category < fRData.fHeader.fCatCount);
1022             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1023             row   = fRData.getRowIndex(state);
1024 
1025             if (state == STOP_STATE) {
1026                 // This is the normal exit from the lookup state machine.
1027                 // Transition to state zero means we have found a safe point.
1028                 break;
1029             }
1030         }
1031 
1032         // The state machine is done.
1033         result = text.getIndex();
1034         if (TRACE) {
1035             System.out.println("result = " + result);
1036         }
1037         return result;
1038     }
1039 
1040     /**
1041      * Set the index of a CharacterIterator.
1042      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1043      * If the index points to a trail surrogate of a supplementary character, adjust it
1044      * to the start (lead surrogate) index.
1045      *
1046      * @param ci A CharacterIterator to set
1047      * @param index the index to set
1048      * @return the resulting index, possibly pinned or adjusted.
1049      */
1050     private static int CISetIndex32(CharacterIterator ci, int index) {
1051         if (index <= ci.getBeginIndex()) {
1052             ci.first();
1053         } else if (index >= ci.getEndIndex()) {
1054             ci.setIndex(ci.getEndIndex());
1055         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1056             if (!Character.isHighSurrogate(ci.previous())) {
1057                 ci.next();
1058             }
1059         }
1060         return ci.getIndex();
1061     }
1062 
1063     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1064      *                 Dictionary boundaries are moved first to this cache, then from here
1065      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1066      *                 boundaries. The public BreakIterator API always fetches directly
1067      *                 from the main BreakCache, not from here.
1068      *
1069      *                 In common situations, the number of boundaries in a single dictionary run
1070      *                 should be quite small, it will be terminated by punctuation, spaces,
1071      *                 or any other non-dictionary characters. The main BreakCache may end
1072      *                 up with boundaries from multiple dictionary based runs.
1073      *
1074      *                 The boundaries are stored in a simple ArrayList (vector), with the
1075      *                 assumption that they will be accessed sequentially.
1076      */
1077     class DictionaryCache  {
1078 
1079          void reset() {
1080              fPositionInCache = -1;
1081              fStart = 0;
1082              fLimit = 0;
1083              fFirstRuleStatusIndex = 0;
1084              fOtherRuleStatusIndex = 0;
1085              fBreaks.removeAllElements();
1086          };
1087 
1088          boolean following(int fromPos) {
1089              if (fromPos >= fLimit || fromPos < fStart) {
1090                  fPositionInCache = -1;
1091                  return false;
1092              }
1093 
1094              // Sequential iteration, move from previous boundary to the following
1095 
1096              int r = 0;
1097              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1098                  ++fPositionInCache;
1099                  if (fPositionInCache >= fBreaks.size()) {
1100                      fPositionInCache = -1;
1101                      return false;
1102                  }
1103                  r = fBreaks.elementAt(fPositionInCache);
1104                  assert(r > fromPos);
1105                  fBoundary = r;
1106                  fStatusIndex = fOtherRuleStatusIndex;
1107                  return true;
1108              }
1109 
1110              // Random indexing. Linear search for the boundary following the given position.
1111 
1112              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1113                  r= fBreaks.elementAt(fPositionInCache);
1114                  if (r > fromPos) {
1115                      fBoundary = r;
1116                      fStatusIndex = fOtherRuleStatusIndex;
1117                      return true;
1118                  }
1119              }
1120 
1121              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1122              assert(false);
1123              fPositionInCache = -1;
1124              return false;
1125          };
1126 
preceding(int fromPos)1127          boolean preceding(int fromPos) {
1128              if (fromPos <= fStart || fromPos > fLimit) {
1129                  fPositionInCache = -1;
1130                  return false;
1131              }
1132 
1133              if (fromPos == fLimit) {
1134                  fPositionInCache = fBreaks.size() - 1;
1135                  if (fPositionInCache >= 0) {
1136                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1137                  }
1138              }
1139 
1140              int r;
1141              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1142                  --fPositionInCache;
1143                  r = fBreaks.elementAt(fPositionInCache);
1144                  assert(r < fromPos);
1145                  fBoundary = r;
1146                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1147                  return true;
1148              }
1149 
1150              if (fPositionInCache == 0) {
1151                  fPositionInCache = -1;
1152                  return false;
1153              }
1154 
1155              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1156                  r = fBreaks.elementAt(fPositionInCache);
1157                  if (r < fromPos) {
1158                      fBoundary = r;
1159                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1160                      return true;
1161                  }
1162              }
1163              assert(false);
1164              fPositionInCache = -1;
1165              return false;
1166          };
1167 
1168         /**
1169          * Populate the cache with the dictionary based boundaries within a region of text.
1170          * @param startPos  The start position of a range of text
1171          * @param endPos    The end position of a range of text
1172          * @param firstRuleStatus The rule status index that applies to the break at startPos
1173          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1174          * @hide draft / provisional / internal are hidden on Android
1175          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1176         void populateDictionary(int startPos, int endPos,
1177                                 int firstRuleStatus, int otherRuleStatus) {
1178             if ((endPos - startPos) <= 1) {
1179                 return;
1180             }
1181 
1182             reset();
1183             fFirstRuleStatusIndex = firstRuleStatus;
1184             fOtherRuleStatusIndex = otherRuleStatus;
1185 
1186             int rangeStart = startPos;
1187             int rangeEnd = endPos;
1188 
1189             int         category;
1190             int         current;
1191             int         foundBreakCount = 0;
1192 
1193             // Loop through the text, looking for ranges of dictionary characters.
1194             // For each span, find the appropriate break engine, and ask it to find
1195             // any breaks within the span.
1196 
1197             fText.setIndex(rangeStart);
1198             int     c = CharacterIteration.current32(fText);
1199             category = (short)fRData.fTrie.get(c);
1200             int dictStart = fRData.fFTable.fDictCategoriesStart;
1201 
1202             while(true) {
1203                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1204                     c = CharacterIteration.next32(fText);    // pre-increment
1205                     category = (short)fRData.fTrie.get(c);
1206                 }
1207                 if (current >= rangeEnd) {
1208                     break;
1209                 }
1210 
1211                 // We now have a dictionary character. Get the appropriate language object
1212                 // to deal with it.
1213                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1214 
1215                 // Ask the language object if there are any breaks. It will add them to the cache and
1216                 // leave the text pointer on the other side of its range, ready to search for the next one.
1217                 if (lbe != null) {
1218                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
1219                 }
1220 
1221                 // Reload the loop variables for the next go-round
1222                 c = CharacterIteration.current32(fText);
1223                 category = (short)fRData.fTrie.get(c);
1224             }
1225 
1226             // If we found breaks, ensure that the first and last entries are
1227             // the original starting and ending position. And initialize the
1228             // cache iteration position to the first entry.
1229 
1230             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1231             if (foundBreakCount > 0) {
1232                 assert(foundBreakCount == fBreaks.size());
1233                 if (startPos < fBreaks.elementAt(0)) {
1234                     // The dictionary did not place a boundary at the start of the segment of text.
1235                     // Add one now. This should not commonly happen, but it would be easy for interactions
1236                     // of the rules for dictionary segments and the break engine implementations to
1237                     // inadvertently cause it. Cover it here, just in case.
1238                     fBreaks.offer(startPos);
1239                 }
1240                 if (endPos > fBreaks.peek()) {
1241                     fBreaks.push(endPos);
1242                 }
1243                 fPositionInCache = 0;
1244                 // Note: Dictionary matching may extend beyond the original limit.
1245                 fStart = fBreaks.elementAt(0);
1246                 fLimit = fBreaks.peek();
1247             } else {
1248                 // there were no language-based breaks, even though the segment contained
1249                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1250                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1251             }
1252 
1253         };
1254 
1255 
DictionaryCache()1256         DictionaryCache() {
1257             fPositionInCache = -1;
1258             fBreaks = new DictionaryBreakEngine.DequeI();
1259         }
1260 
1261         /**
1262          * copy constructor. Used by RuleBasedBreakIterator.clone().
1263          *
1264          * @param src the source object to be copied.
1265          */
DictionaryCache(DictionaryCache src)1266         DictionaryCache(DictionaryCache src)  {
1267             try {
1268                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1269             }
1270             catch (CloneNotSupportedException e) {
1271                 throw new RuntimeException(e);
1272             }
1273             fPositionInCache      = src.fPositionInCache;
1274             fStart                = src.fStart;
1275             fLimit                = src.fLimit;
1276             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1277             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1278             fBoundary             = src.fBoundary;
1279             fStatusIndex          = src.fStatusIndex;
1280         }
1281 
1282         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1283         DictionaryBreakEngine.DequeI fBreaks;
1284         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1285         //                                      //    or preceding(). Optimizes sequential access.
1286         int             fStart;                 // Text position of first boundary in cache.
1287         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1288         //                                      //    text segment being handled by the dictionary.
1289         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1290         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1291         int             fBoundary;              // Current boundary. Set by preceding(), following().
1292         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1293     };
1294 
1295 
1296 
1297 
1298 /*
1299  * class BreakCache
1300  *
1301  * Cache of break boundary positions and rule status values.
1302  * Break iterator API functions, next(), previous(), etc., will use cached results
1303  * when possible, and otherwise cache new results as they are obtained.
1304  *
1305  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1306  *
1307  * The cache is implemented as a single circular buffer.
1308  */
1309 
1310 /*
1311  * size of the circular cache buffer.
1312  */
1313 
1314 class BreakCache {
1315 
BreakCache()1316     BreakCache() {
1317         reset();
1318     };
1319 
reset(int pos, int ruleStatus)1320     void  reset(int pos, int ruleStatus) {
1321         fStartBufIdx = 0;
1322         fEndBufIdx = 0;
1323         fTextIdx = pos;
1324         fBufIdx = 0;
1325         fBoundaries[0] = pos;
1326         fStatuses[0] = (short)ruleStatus;
1327     }
1328 
reset()1329     void  reset() {reset(0, 0); };
1330 
next()1331     void  next() {
1332         if (fBufIdx == fEndBufIdx) {
1333             fDone = !populateFollowing();
1334             fPosition = fTextIdx;
1335             fRuleStatusIndex = fStatuses[fBufIdx];
1336         } else {
1337             fBufIdx = modChunkSize(fBufIdx + 1);
1338             fTextIdx = fPosition = fBoundaries[fBufIdx];
1339             fRuleStatusIndex = fStatuses[fBufIdx];
1340         }
1341     };
1342 
previous()1343     void  previous() {
1344         int initialBufIdx = fBufIdx;
1345         if (fBufIdx == fStartBufIdx) {
1346             // At start of cache. Prepend to it.
1347             populatePreceding();
1348         } else {
1349             // Cache already holds the next boundary
1350             fBufIdx = modChunkSize(fBufIdx - 1);
1351             fTextIdx = fBoundaries[fBufIdx];
1352         }
1353         fDone = (fBufIdx == initialBufIdx);
1354         fPosition = fTextIdx;
1355         fRuleStatusIndex = fStatuses[fBufIdx];
1356         return;
1357     };
1358 
1359     // Move the iteration state to the position following the startPosition.
1360     // Input position must be pinned to the input length.
following(int startPos)1361     void following(int startPos) {
1362         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1363             // startPos is in the cache. Do a next() from that position.
1364             // TODO: an awkward set of interactions with bi->fDone
1365             //       seek() does not clear it; it can't because of interactions with populateNear().
1366             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1367             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1368             fDone = false;
1369             next();
1370         }
1371 
1372     };
1373 
preceding(int startPos)1374     void  preceding(int startPos) {
1375         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1376             if (startPos == fTextIdx) {
1377                 previous();
1378             } else {
1379                 // seek() leaves the BreakCache positioned at the preceding boundary
1380                 //        if the requested position is between two boundaries.
1381                 // current() pushes the BreakCache position out to the BreakIterator itself.
1382                 assert(startPos > fTextIdx);
1383                 current();
1384             }
1385         }
1386         return;
1387     };
1388 
1389     /**
1390      * Update the state of the public BreakIterator (fBI) to reflect the
1391      * current state of the break iterator cache (this).
1392      */
current()1393     int current() {
1394         fPosition = fTextIdx;
1395         fRuleStatusIndex = fStatuses[fBufIdx];
1396         fDone = false;
1397         return fTextIdx;
1398     };
1399 
1400     /**
1401      * Add boundaries to the cache near the specified position.
1402      * The given position need not be a boundary itself.
1403      * The input position must be within the range of the text, and
1404      * on a code point boundary.
1405      * If the requested position is a break boundary, leave the iteration
1406      * position on it.
1407      * If the requested position is not a boundary, leave the iteration
1408      * position on the preceding boundary and include both the the
1409      * preceding and following boundaries in the cache.
1410      * Additional boundaries, either preceding or following, may be added
1411      * to the cache as a side effect.
1412      *
1413      * Return false if the operation failed.
1414      */
populateNear(int position)1415     boolean populateNear(int position) {
1416         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1417 
1418         // Find a boundary somewhere in the vicinity of the requested position.
1419         // Depending on the safe rules and the text data, it could be either before, at, or after
1420         // the requested position.
1421 
1422 
1423         // If the requested position is not near already cached positions, clear the existing cache,
1424         // find a near-by boundary and begin new cache contents there.
1425 
1426         if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1427             int aBoundary = fText.getBeginIndex();
1428             int ruleStatusIndex = 0;
1429             if (position > aBoundary + 20) {
1430                 int backupPos = handleSafePrevious(position);
1431                 if (backupPos > aBoundary) {
1432                     // Advance to the boundary following the backup position.
1433                     // There is a complication: the safe reverse rules identify pairs of code points
1434                     // that are safe. If advancing from the safe point moves forwards by less than
1435                     // two code points, we need to advance one more time to ensure that the boundary
1436                     // is good, including a correct rules status value.
1437                     //
1438                     fPosition = backupPos;
1439                     aBoundary = handleNext();
1440                     if (aBoundary == backupPos + 1 ||
1441                             (aBoundary == backupPos + 2 &&
1442                             Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1443                             Character.isLowSurrogate(fText.next()))) {
1444                         // The initial handleNext() only advanced by a single code point. Go again.
1445                         // Safe rules identify safe pairs.
1446                         aBoundary = handleNext();
1447                     }
1448                 }
1449                 ruleStatusIndex = fRuleStatusIndex;
1450             }
1451             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1452         }
1453 
1454         // Fill in boundaries between existing cache content and the new requested position.
1455 
1456         if (fBoundaries[fEndBufIdx] < position) {
1457             // The last position in the cache precedes the requested position.
1458             // Add following position(s) to the cache.
1459             while (fBoundaries[fEndBufIdx] < position) {
1460                 if (!populateFollowing()) {
1461                     assert false;
1462                     return false;
1463                 }
1464             }
1465             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1466             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1467             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1468                 previous();
1469             }
1470             return true;
1471         }
1472 
1473         if (fBoundaries[fStartBufIdx] > position) {
1474             // The first position in the cache is beyond the requested position.
1475             // back up more until we get a boundary <= the requested position.
1476             while (fBoundaries[fStartBufIdx] > position) {
1477                 populatePreceding();
1478             }
1479             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1480             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1481             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1482                 next();
1483             }
1484             if (fTextIdx > position) {
1485                 // If position is not itself a boundary, the next() loop above will overshoot.
1486                 // Back up one, leaving cache position at the boundary preceding the requested position.
1487                 previous();
1488             }
1489             return true;
1490         }
1491 
1492         assert fTextIdx == position;
1493         return true;
1494 
1495     };
1496 
1497     /**
1498      *  Add boundary(s) to the cache following the current last boundary.
1499      *  Return false if at the end of the text, and no more boundaries can be added.
1500      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1501      */
populateFollowing()1502     boolean populateFollowing() {
1503         int fromPosition = fBoundaries[fEndBufIdx];
1504         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1505         int pos = 0;
1506         int ruleStatusIdx = 0;
1507 
1508         if (fDictionaryCache.following(fromPosition)) {
1509             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1510             return true;
1511         }
1512 
1513         fPosition = fromPosition;
1514         pos = handleNext();
1515         if (pos == BreakIterator.DONE) {
1516             return false;
1517         }
1518 
1519         ruleStatusIdx = fRuleStatusIndex;
1520         if (fDictionaryCharCount > 0) {
1521             // The text segment obtained from the rules includes dictionary characters.
1522             // Subdivide it, with subdivided results going into the dictionary cache.
1523             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1524             if (fDictionaryCache.following(fromPosition)) {
1525                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1526                 return true;
1527                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1528                 //       But be careful with interactions with populateNear().
1529             }
1530         }
1531 
1532         // Rule based segment did not include dictionary characters.
1533         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1534         //    meaning that we didn't take the return, above.
1535         // Add its end point to the cache.
1536         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1537 
1538         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1539         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1540         //
1541         for (int count=0; count<6; ++count) {
1542             pos = handleNext();
1543             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1544                 break;
1545             }
1546             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1547         }
1548         return true;
1549     };
1550 
1551     /**
1552      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1553      *  Leave the iteration position on the first added boundary.
1554      *  Return false if no boundaries could be added (if at the start of the text.)
1555      */
populatePreceding()1556     boolean populatePreceding() {
1557         int textBegin = fText.getBeginIndex();
1558         int fromPosition = fBoundaries[fStartBufIdx];
1559         if (fromPosition == textBegin) {
1560             return false;
1561         }
1562 
1563         int position = textBegin;
1564         int positionStatusIdx = 0;
1565 
1566         if (fDictionaryCache.preceding(fromPosition)) {
1567             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1568             return true;
1569         }
1570 
1571         int backupPosition = fromPosition;
1572 
1573         // Find a boundary somewhere preceding the first already-cached boundary
1574         do {
1575             backupPosition = backupPosition - 30;
1576             if (backupPosition <= textBegin) {
1577                 backupPosition = textBegin;
1578             } else {
1579                 backupPosition = handleSafePrevious(backupPosition);
1580             }
1581             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1582                 position = textBegin;
1583                 positionStatusIdx = 0;
1584             } else {
1585                 // Advance to the boundary following the backup position.
1586                 // There is a complication: the safe reverse rules identify pairs of code points
1587                 // that are safe. If advancing from the safe point moves forwards by less than
1588                 // two code points, we need to advance one more time to ensure that the boundary
1589                 // is good, including a correct rules status value.
1590                 //
1591                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1592                 position = handleNext();
1593                 if (position == backupPosition + 1 ||
1594                         (position == backupPosition + 2 &&
1595                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1596                         Character.isLowSurrogate(fText.next()))) {
1597                     // The initial handleNext() only advanced by a single code point. Go again.
1598                     // Safe rules identify safe pairs.
1599                     position = handleNext();
1600                 }
1601                 positionStatusIdx = fRuleStatusIndex;
1602             }
1603         } while (position >= fromPosition);
1604 
1605         // Find boundaries between the one we just located and the first already-cached boundary
1606         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1607 
1608         fSideBuffer.removeAllElements();
1609         fSideBuffer.push(position);
1610         fSideBuffer.push(positionStatusIdx);
1611 
1612         do {
1613             int prevPosition = fPosition = position;
1614             int prevStatusIdx = positionStatusIdx;
1615             position = handleNext();
1616             positionStatusIdx = fRuleStatusIndex;
1617             if (position == BreakIterator.DONE) {
1618                 break;
1619             }
1620 
1621             boolean segmentHandledByDictionary = false;
1622             if (fDictionaryCharCount != 0) {
1623                 // Segment from the rules includes dictionary characters.
1624                 // Subdivide it, with subdivided results going into the dictionary cache.
1625                 int dictSegEndPosition = position;
1626                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1627                 while (fDictionaryCache.following(prevPosition)) {
1628                     position = fDictionaryCache.fBoundary;
1629                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1630                     segmentHandledByDictionary = true;
1631                     assert(position > prevPosition);
1632                     if (position >= fromPosition) {
1633                         break;
1634                     }
1635                     assert(position <= dictSegEndPosition);
1636                     fSideBuffer.push(position);
1637                     fSideBuffer.push(positionStatusIdx);
1638                     prevPosition = position;
1639                 }
1640                 assert(position==dictSegEndPosition || position>=fromPosition);
1641             }
1642 
1643             if (!segmentHandledByDictionary && position < fromPosition) {
1644                 fSideBuffer.push(position);
1645                 fSideBuffer.push(positionStatusIdx);
1646             }
1647         } while (position < fromPosition);
1648 
1649         // Move boundaries from the side buffer to the main circular buffer.
1650         boolean success = false;
1651         if (!fSideBuffer.isEmpty()) {
1652             positionStatusIdx = fSideBuffer.pop();
1653             position = fSideBuffer.pop();
1654             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1655             success = true;
1656         }
1657 
1658         while (!fSideBuffer.isEmpty()) {
1659             positionStatusIdx = fSideBuffer.pop();
1660             position = fSideBuffer.pop();
1661             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1662                 // No space in circular buffer to hold a new preceding result while
1663                 // also retaining the current cache (iteration) position.
1664                 // Bailing out is safe; the cache will refill again if needed.
1665                 break;
1666             }
1667         }
1668         return success;
1669     };
1670 
1671 
1672     static final boolean RetainCachePosition = false;
1673     static final boolean UpdateCachePosition = true;
1674 
1675     /**
1676      * Add the boundary following the current position.
1677      * The current position can be left as it was, or changed to the newly added boundary,
1678      * as specified by the update parameter.
1679      */
addFollowing(int position, int ruleStatusIdx, boolean update)1680     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1681         assert(position > fBoundaries[fEndBufIdx]);
1682         assert(ruleStatusIdx <= Short.MAX_VALUE);
1683         int nextIdx = modChunkSize(fEndBufIdx + 1);
1684         if (nextIdx == fStartBufIdx) {
1685             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1686         }
1687         fBoundaries[nextIdx] = position;
1688         fStatuses[nextIdx] = (short)ruleStatusIdx;
1689         fEndBufIdx = nextIdx;
1690         if (update == UpdateCachePosition) {
1691             // Set current position to the newly added boundary.
1692             fBufIdx = nextIdx;
1693             fTextIdx = position;
1694         } else {
1695             // Retaining the original cache position.
1696             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1697             // It's the responsibility of callers of this function to not add too many.
1698             assert(nextIdx != fBufIdx);
1699         }
1700 
1701     };
1702 
1703 
1704     /**
1705      * Add the boundary preceding the current position.
1706      * The current position can be left as it was, or changed to the newly added boundary,
1707      * as specified by the update parameter.
1708      */
addPreceding(int position, int ruleStatusIdx, boolean update)1709     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1710         assert(position < fBoundaries[fStartBufIdx]);
1711         assert(ruleStatusIdx <= Short.MAX_VALUE);
1712         int nextIdx = modChunkSize(fStartBufIdx - 1);
1713         if (nextIdx == fEndBufIdx) {
1714             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1715                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1716                 // current iteration position. And we also want to retain the current iteration position.
1717                 // (The buffer is already completely full of entries that precede the iteration position.)
1718                 return false;
1719             }
1720             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1721         }
1722         fBoundaries[nextIdx] = position;
1723         fStatuses[nextIdx] = (short)ruleStatusIdx;
1724         fStartBufIdx = nextIdx;
1725         if (update == UpdateCachePosition) {
1726             fBufIdx = nextIdx;
1727             fTextIdx = position;
1728         }
1729         return true;
1730     };
1731 
1732     /**
1733      *  Set the cache position to the specified position, or, if the position
1734      *  falls between to cached boundaries, to the preceding boundary.
1735      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1736      *  The startPosition must be on a code point boundary.
1737      *
1738      *  Return true if successful, false if the specified position is after
1739      *  the last cached boundary or before the first.
1740      */
1741     boolean seek(int pos) {
1742         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1743             return false;
1744         }
1745         if (pos == fBoundaries[fStartBufIdx]) {
1746             // Common case: seek(0), from BreakIterator::first()
1747             fBufIdx = fStartBufIdx;
1748             fTextIdx = fBoundaries[fBufIdx];
1749             return true;
1750         }
1751         if (pos == fBoundaries[fEndBufIdx]) {
1752             fBufIdx = fEndBufIdx;
1753             fTextIdx = fBoundaries[fBufIdx];
1754             return true;
1755         }
1756 
1757         int min = fStartBufIdx;
1758         int max = fEndBufIdx;
1759         while (min != max) {
1760             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1761             probe = modChunkSize(probe);
1762             if (fBoundaries[probe] > pos) {
1763                 max = probe;
1764             } else {
1765                 min = modChunkSize(probe + 1);
1766             }
1767         }
1768         assert(fBoundaries[max] > pos);
1769         fBufIdx = modChunkSize(max - 1);
1770         fTextIdx = fBoundaries[fBufIdx];
1771         assert(fTextIdx <= pos);
1772         return true;
1773 
1774     };
1775 
1776 
1777     /**
1778      * copy constructor, used from RuleBasedBreakIterator.clone().
1779      *
1780      * @param src
1781      */
BreakCache(BreakCache src)1782     BreakCache(BreakCache src)  {
1783         fStartBufIdx = src.fStartBufIdx;
1784         fEndBufIdx = src.fEndBufIdx;
1785         fTextIdx = src.fTextIdx;
1786         fBufIdx = src.fBufIdx;
1787         fBoundaries = src.fBoundaries.clone();
1788         fStatuses = src.fStatuses.clone();
1789         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1790     }
1791 
dumpCache()1792     void dumpCache() {
1793         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1794         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1795             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1796             if (i == fEndBufIdx) {
1797                 break;
1798             }
1799         }
1800     };
1801 
modChunkSize(int index)1802     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1803 
1804     static final int CACHE_SIZE = 128;
1805     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1806 
1807     int                 fStartBufIdx;
1808     int                 fEndBufIdx;    // inclusive
1809 
1810     int                 fTextIdx;
1811     int                 fBufIdx;
1812 
1813     int[]               fBoundaries = new int[CACHE_SIZE];
1814     short[]             fStatuses = new short[CACHE_SIZE];
1815 
1816     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1817 };
1818 
1819 
1820 
1821 
1822 }
1823 
1824