• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2005-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import static com.ibm.icu.impl.CharacterIteration.DONE32;
13 import static com.ibm.icu.impl.CharacterIteration.next32;
14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
15 
16 import java.io.ByteArrayOutputStream;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.OutputStream;
20 import java.nio.ByteBuffer;
21 import java.text.CharacterIterator;
22 import java.util.MissingResourceException;
23 import java.util.concurrent.ConcurrentLinkedQueue;
24 
25 import com.ibm.icu.impl.CharacterIteration;
26 import com.ibm.icu.impl.ICUBinary;
27 import com.ibm.icu.impl.ICUDebug;
28 import com.ibm.icu.impl.RBBIDataWrapper;
29 import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
30 import com.ibm.icu.impl.breakiter.CjkBreakEngine;
31 import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
32 import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
33 import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
34 import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
35 import com.ibm.icu.impl.breakiter.LaoBreakEngine;
36 import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
37 import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
38 import com.ibm.icu.lang.UCharacter;
39 import com.ibm.icu.lang.UProperty;
40 import com.ibm.icu.lang.UScript;
41 import com.ibm.icu.util.CodePointTrie;
42 
43 /**
44  * Rule Based Break Iterator
45  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
46  *
47  * @stable ICU 2.0
48  */
49 public class RuleBasedBreakIterator extends BreakIterator {
50     //=======================================================================
51     // Constructors & Factories
52     //=======================================================================
53 
54     /**
55      * private constructor
56      */
RuleBasedBreakIterator()57     private RuleBasedBreakIterator() {
58         fDictionaryCharCount  = 0;
59     }
60 
61     /**
62      * Create a break iterator from a precompiled set of break rules.
63      *
64      * Creating a break iterator from the binary rules is much faster than
65      * creating one from source rules.
66      *
67      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
68      * Binary break iterator rules are not guaranteed to be compatible between
69      * different versions of ICU.
70      *
71      * @param is an input stream supplying the compiled binary rules.
72      * @throws IOException if there is an error while reading the rules from the InputStream.
73      * @see    #compileRules(String, OutputStream)
74      * @stable ICU 4.8
75      */
getInstanceFromCompiledRules(InputStream is)76     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
77         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
78         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
79         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
80         return This;
81     }
82 
83     /**
84      * This factory method doesn't have an access modifier; it is only accessible in the same
85      * package.
86      *
87      * Create a break iterator from a precompiled set of break rules.
88      *
89      * Creating a break iterator from the binary rules is much faster than
90      * creating one from source rules.
91      *
92      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
93      * Binary break iterator rules are not guaranteed to be compatible between
94      * different versions of ICU.
95      *
96      * @param bytes a buffer supplying the compiled binary rules.
97      * @param phraseBreaking a flag indicating if phrase breaking is required.
98      * @throws IOException if there is an error while reading the rules from the buffer.
99      * @see    #compileRules(String, OutputStream)
100      * @internal
101      */
getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)102     /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
103             ByteBuffer bytes, boolean phraseBreaking) throws IOException {
104         RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
105         instance.fPhraseBreaking = phraseBreaking;
106         return instance;
107     }
108 
109     /**
110      * Create a break iterator from a precompiled set of break rules.
111      *
112      * Creating a break iterator from the binary rules is much faster than
113      * creating one from source rules.
114      *
115      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
116      * Binary break iterator rules are not guaranteed to be compatible between
117      * different versions of ICU.
118      *
119      * @param bytes a buffer supplying the compiled binary rules.
120      * @throws IOException if there is an error while reading the rules from the buffer.
121      * @see    #compileRules(String, OutputStream)
122      * @internal
123      * @deprecated This API is ICU internal only.
124      */
125     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)126     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
127         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
128         This.fRData = RBBIDataWrapper.get(bytes);
129         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
130         return This;
131     }
132 
133     /**
134      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
135      * @param rules The break rules to be used.
136      * @stable ICU 2.2
137      */
RuleBasedBreakIterator(String rules)138     public RuleBasedBreakIterator(String rules)  {
139         this();
140         try {
141             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
142             compileRules(rules, ruleOS);
143             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
144             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
145         } catch (IOException e) {
146             ///CLOVER:OFF
147             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
148             //  causing bogus compiled rules to be produced, but with no compile error raised.
149             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
150                     + e.getMessage());
151             throw rte;
152             ///CLOVER:ON
153         }
154     }
155 
156     //=======================================================================
157     // Boilerplate
158     //=======================================================================
159 
160     /**
161      * Clones this iterator.
162      * @return A newly-constructed RuleBasedBreakIterator with the same
163      * behavior as this one.
164      * @stable ICU 2.0
165      */
166     @Override
clone()167     public Object clone()  {
168         RuleBasedBreakIterator result;
169         result = (RuleBasedBreakIterator)super.clone();
170         if (fText != null) {
171             result.fText = (CharacterIterator)(fText.clone());
172         }
173         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
174         result.fBreakCache = result.new BreakCache(fBreakCache);
175         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
176         return result;
177     }
178 
179 
180     /**
181      * Returns true if both BreakIterators are of the same class, have the same
182      * rules, and iterate over the same text.
183      * @stable ICU 2.0
184      */
185     @Override
equals(Object that)186     public boolean equals(Object that) {
187         if (that == null) {
188             return false;
189         }
190         if (this == that) {
191             return true;
192         }
193         try {
194             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
195             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
196                 return false;
197             }
198             if (fRData != null && other.fRData != null &&
199                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
200                 return false;
201             }
202             if (fText == null && other.fText == null) {
203                 return true;
204             }
205             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
206                 return false;
207             }
208             return fPosition == other.fPosition;
209         }
210         catch(ClassCastException e) {
211             return false;
212         }
213      }
214 
215     /**
216      * Returns the description (rules) used to create this iterator.
217      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
218      * @stable ICU 2.0
219      */
220     @Override
toString()221     public String toString() {
222         String retStr = "";
223         if (fRData != null) {
224             retStr =  fRData.fRuleSource;
225         }
226         return retStr;
227     }
228 
229     /**
230      * Compute a hashcode for this BreakIterator
231      * @return A hash code
232      * @stable ICU 2.0
233      */
234     @Override
hashCode()235     public int hashCode()
236     {
237         return fRData.fRuleSource.hashCode();
238     }
239 
240 
241     private static final int  START_STATE = 1;     // The state number of the starting state
242     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
243 
244     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
245     //               of user text.  A variable with this enum type keeps track of where we
246     //               are.  The state machine only fetches user text input while in RUN mode.
247     private static final int  RBBI_START  = 0;
248     private static final int  RBBI_RUN    = 1;
249     private static final int  RBBI_END    = 2;
250 
251     /**
252      * The character iterator through which this BreakIterator accesses the text.
253      */
254     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
255 
256     /**
257      * The rule data for this BreakIterator instance.
258      * Not intended for public use. Declared public for testing purposes only.
259      * @internal
260      * @deprecated This API is ICU internal only.
261      */
262     @Deprecated
263     public RBBIDataWrapper    fRData;
264 
265     /**
266      *  The iteration state - current position, rule status for the current position,
267      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
268      *                        Current position is pinned to be 0 < position <= text.length.
269      *                        Current position is always set to a boundary.
270      *
271      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
272      *  Never has the value UBRK_DONE (-1).
273      */
274     private int                fPosition;
275 
276     /**
277      * Index of the Rule {tag} values for the most recent match.
278      */
279     private int                fRuleStatusIndex;
280 
281     /**
282      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
283      */
284     private boolean            fDone;
285 
286     /**
287      *  Array of look-ahead tentative results.
288      */
289     private int[]              fLookAheadMatches;
290 
291     /**
292      *   Cache of previously determined boundary positions.
293      */
294     private BreakCache         fBreakCache = new BreakCache();
295 
296     /**
297      * Flag used to indicate if phrase breaking is required.
298      */
299     private boolean            fPhraseBreaking = false;
300 
301 
302     /**
303      * Counter for the number of characters encountered with the "dictionary"
304      *   flag set.  Normal RBBI iterators don't use it, although the code
305      *   for updating it is live.  Dictionary Based break iterators (a subclass
306      *   of us) access this field directly.
307      * @internal
308      */
309     private int fDictionaryCharCount;
310 
311     private DictionaryCache     fDictionaryCache = new DictionaryCache();
312 
313     /**
314      * ICU debug argument name for RBBI
315      */
316     private static final String RBBI_DEBUG_ARG = "rbbi";
317 
318     /**
319      * Debugging flag.  Trace operation of state machine when true.
320      */
321     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
322             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
323 
324     /**
325      * The "default" break engine - just skips over ranges of dictionary words,
326      * producing no breaks. Should only be used if characters need to be handled
327      * by a dictionary but we have no dictionary implementation for them.
328      *
329      * Only one instance; shared by all break iterators.
330      */
331     private static final UnhandledBreakEngine gUnhandledBreakEngine;
332 
333     /**
334      * List of all known break engines, common for all break iterators.
335      * Lazily updated as break engines are needed, because instantiation of
336      * break engines is expensive.
337      *
338      * Important notes:
339      * <ul>Because we don't want to add the same LanguageBreakEngine multiple times, all writes
340      *     are synchronized.
341      * <ul>Read access avoids explicit synchronization, but will end up being synchronized if
342      *     needed.
343      */
344     private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines;
345 
346     static {
347         gUnhandledBreakEngine = new UnhandledBreakEngine();
348         gAllBreakEngines = new ConcurrentLinkedQueue<>();
349         gAllBreakEngines.add(gUnhandledBreakEngine);
350     }
351 
352     /**
353      * Dump the contents of the state table and character classes for this break iterator.
354      * For debugging only.
355      * @internal
356      * @deprecated This API is ICU internal only.
357      */
358     @Deprecated
dump(java.io.PrintStream out)359     public void dump(java.io.PrintStream out) {
360         if (out == null) {
361             out = System.out;
362         }
363         this.fRData.dump(out);
364     }
365 
366     /**
367      * Compile a set of source break rules into the binary state tables used
368      * by the break iterator engine.  Creating a break iterator from precompiled
369      * rules is much faster than creating one from source rules.
370      *
371      * Binary break rules are not guaranteed to be compatible between different
372      * versions of ICU.
373      *
374      *
375      * @param rules  The source form of the break rules
376      * @param ruleBinary  An output stream to receive the compiled rules.
377      * @throws IOException If there is an error writing the output.
378      * @see #getInstanceFromCompiledRules(InputStream)
379      * @stable ICU 4.8
380      */
compileRules(String rules, OutputStream ruleBinary)381     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
382         RBBIRuleBuilder.compileRules(rules, ruleBinary);
383     }
384 
385     //=======================================================================
386     // BreakIterator overrides
387     //=======================================================================
388 
389     /**
390      * Sets the current iteration position to the beginning of the text.
391      * (i.e., the CharacterIterator's starting offset).
392      * @return The offset of the beginning of the text.
393      * @stable ICU 2.0
394      */
395     @Override
first()396     public int first() {
397         if (fText == null) {
398             return BreakIterator.DONE;
399         }
400         fText.first();
401         int start =  fText.getIndex();
402         if (!fBreakCache.seek(start)) {
403             fBreakCache.populateNear(start);
404         }
405         fBreakCache.current();
406         assert(fPosition == start);
407         return fPosition;
408     }
409 
410     /**
411      * Sets the current iteration position to the end of the text.
412      * (i.e., the CharacterIterator's ending offset).
413      * @return The text's past-the-end offset.
414      * @stable ICU 2.0
415      */
416     @Override
last()417     public int last() {
418         if (fText == null) {
419             return BreakIterator.DONE;
420         }
421         int endPos = fText.getEndIndex();
422         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
423         assert(endShouldBeBoundary);
424         if (fPosition != endPos) {
425             assert(fPosition == endPos);
426         }
427         return endPos;
428     }
429 
430     /**
431      * Advances the iterator either forward or backward the specified number of steps.
432      * Negative values move backward, and positive values move forward.  This is
433      * equivalent to repeatedly calling next() or previous().
434      * @param n The number of steps to move.  The sign indicates the direction
435      * (negative is backwards, and positive is forwards).
436      * @return The character offset of the boundary position n boundaries away from
437      * the current one.
438      * @stable ICU 2.0
439      */
440     @Override
next(int n)441     public int next(int n) {
442         int result = 0;
443         if (n > 0) {
444             for (; n > 0 && result != DONE; --n) {
445                 result = next();
446             }
447         } else if (n < 0) {
448             for (; n < 0 && result != DONE; ++n) {
449                 result = previous();
450             }
451         } else {
452             result = current();
453         }
454         return result;
455     }
456 
457     /**
458      * Advances the iterator to the next boundary position.
459      * @return The position of the first boundary after this one.
460      * @stable ICU 2.0
461      */
462     @Override
next()463     public int next() {
464         fBreakCache.next();
465         return fDone ? DONE : fPosition;
466     }
467 
468     /**
469      * Moves the iterator backwards, to the boundary preceding the current one.
470      * @return The position of the boundary position immediately preceding the starting position.
471      * @stable ICU 2.0
472      */
473     @Override
previous()474     public int previous() {
475         fBreakCache.previous();
476         return fDone ? DONE : fPosition;
477     }
478 
479     /**
480      * Sets the iterator to refer to the first boundary position following
481      * the specified position.
482      * @param startPos The position from which to begin searching for a break position.
483      * @return The position of the first break after the current position.
484      * @stable ICU 2.0
485      */
486     @Override
following(int startPos)487     public int following(int startPos) {
488         // if the supplied position is before the beginning, return the
489         // text's starting offset
490         if (startPos < fText.getBeginIndex()) {
491             return first();
492         }
493 
494         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
495         // Or it may be beyond the end of the text.
496         startPos = CISetIndex32(fText, startPos);
497         fBreakCache.following(startPos);
498         return fDone ? DONE : fPosition;
499     }
500 
501 
502     /**
503      * Sets the iterator to refer to the last boundary position before the
504      * specified position.
505      * @param offset The position to begin searching for a break from.
506      * @return The position of the last boundary before the starting position.
507      * @stable ICU 2.0
508      */
509     @Override
preceding(int offset)510     public int preceding(int offset) {
511         if (fText == null || offset > fText.getEndIndex()) {
512             return last();
513         } else if (offset < fText.getBeginIndex()) {
514             return first();
515         }
516 
517         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
518         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
519         int adjustedOffset = offset;
520         fBreakCache.preceding(adjustedOffset);
521         return fDone ? DONE : fPosition;
522 
523     }
524 
525 
526     /**
527      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
528      * @stable ICU 2.0
529      */
checkOffset(int offset, CharacterIterator text)530     protected static final void checkOffset(int offset, CharacterIterator text) {
531         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
532             throw new IllegalArgumentException("offset out of bounds");
533         }
534     }
535 
536 
537     /**
538      * Returns true if the specified position is a boundary position.  As a side
539      * effect, leaves the iterator pointing to the first boundary position at
540      * or after "offset".
541      * @param offset the offset to check.
542      * @return True if "offset" is a boundary position.
543      * @stable ICU 2.0
544      */
545     @Override
isBoundary(int offset)546     public boolean isBoundary(int offset) {
547         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
548         //       to not be boundaries, and to not be errors.
549         checkOffset(offset, fText);
550 
551         // Adjust offset to be on a code point boundary and not beyond the end of the text.
552         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
553         // But we still need the side effect of leaving iteration at the following boundary.
554         int adjustedOffset = CISetIndex32(fText, offset);
555 
556         boolean result = false;
557         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
558             result = (fBreakCache.current() == offset);
559         }
560 
561         if (!result) {
562             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
563             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
564             next();
565         }
566         return result;
567 
568     }
569 
570     /**
571      * Returns the current iteration position.  Note that DONE is never
572      * returned from this function; if iteration has run to the end of a
573      * string, current() will return the length of the string while
574      * next() will return BreakIterator.DONE).
575      * @return The current iteration position.
576      * @stable ICU 2.0
577      */
578     @Override
current()579     public int current() {
580         return (fText != null) ? fPosition : BreakIterator.DONE;
581     }
582 
583 
584     /**
585      * Return the status tag from the break rule that determined the boundary at
586      * the current iteration position.  The values appear in the rule source
587      * within brackets, {123}, for example.  For rules that do not specify a
588      * status, a default value of 0 is returned.  If more than one rule applies,
589      * the numerically largest of the possible status values is returned.
590      * <p>
591      * Of the standard types of ICU break iterators, only the word and line break
592      * iterator provides status values.  The values are defined in
593      * class RuleBasedBreakIterator, and allow distinguishing between words
594      * that contain alphabetic letters, "words" that appear to be numbers,
595      * punctuation and spaces, words containing ideographic characters, and
596      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
597      * position from <code>next()</code>, <code>previous()</code>, or
598      * any other break iterator functions that returns a boundary position.
599      * <p>
600      * Note that <code>getRuleStatus()</code> returns the value corresponding to
601      * <code>current()</code> index even after <code>next()</code> has returned DONE.
602      * <p>
603 
604      * @return the status from the break rule that determined the boundary
605      * at the current iteration position.
606      *
607      * @stable ICU 60
608      */
609 
610     @Override
getRuleStatus()611     public int  getRuleStatus() {
612         //   Status records have this form:
613         //           Count N         <--  fLastRuleStatusIndex points here.
614         //           Status val 0
615         //           Status val 1
616         //              ...
617         //           Status val N-1  <--  the value we need to return
618         //   The status values are sorted in ascending order.
619         //   This function returns the last (largest) of the array of status values.
620         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
621         int  tagVal = fRData.fStatusTable[idx];
622         return tagVal;
623     }
624 
625     /**
626      * Get the status (tag) values from the break rule(s) that determined the boundary
627      * at the current iteration position.  The values appear in the rule source
628      * within brackets, {123}, for example.  The default status value for rules
629      * that do not explicitly provide one is zero.
630      * <p>
631      * The status values used by the standard ICU break rules are defined
632      * as public constants in class RuleBasedBreakIterator.
633      * <p>
634      * If the size  of the output array is insufficient to hold the data,
635      *  the output will be truncated to the available length.  No exception
636      *  will be thrown.
637      *
638      * @param fillInArray an array to be filled in with the status values.
639      * @return          The number of rule status values from the rules that determined
640      *                  the boundary at the current iteration position.
641      *                  In the event that the array is too small, the return value
642      *                  is the total number of status values that were available,
643      *                  not the reduced number that were actually returned.
644      * @stable ICU 60
645      */
646     @Override
getRuleStatusVec(int[] fillInArray)647     public int getRuleStatusVec(int[] fillInArray) {
648         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
649         if (fillInArray != null) {
650             int numToCopy = Math.min(numStatusVals, fillInArray.length);
651             for (int i=0; i<numToCopy; i++) {
652                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
653             }
654         }
655         return numStatusVals;
656     }
657 
658     /**
659      * Returns a CharacterIterator over the text being analyzed.
660      * <p>
661      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
662      * must not be modified in any way while the BreakIterator is still in use.
663      * Doing so will lead to undefined behavior of the BreakIterator.
664      * Clone the returned CharacterIterator first and work with that.
665      * <p>
666      * The returned CharacterIterator is a reference
667      * to the <b>actual iterator being used</b> by the BreakIterator.
668      * No guarantees are made about the current position
669      * of this iterator when it is returned; it may differ from the
670      * BreakIterators current position.  If you need to move that
671      * position to examine the text, clone this function's return value first.
672      * @return An iterator over the text being analyzed.
673      * @stable ICU 2.0
674      */
675     @Override
getText()676     public CharacterIterator getText() {
677         return fText;
678     }
679 
680     /**
681      * Set the iterator to analyze a new piece of text.  This function resets
682      * the current iteration position to the beginning of the text.
683      * (The old iterator is dropped.)
684      * <p>
685      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
686      * directly by the BreakIterator, and must not be altered in any
687      * way by code outside of the BreakIterator.
688      * Doing so will lead to undefined behavior of the BreakIterator.
689      *
690      * @param newText An iterator over the text to analyze.
691      * @stable ICU 2.0
692      */
693     @Override
setText(CharacterIterator newText)694     public void setText(CharacterIterator newText) {
695         if (newText != null) {
696             fBreakCache.reset(newText.getBeginIndex(), 0);
697         } else {
698             fBreakCache.reset();
699         }
700         fDictionaryCache.reset();
701         fText = newText;
702         this.first();
703     }
704 
705      /**
706      * Control debug, trace and dump options.
707      * @internal
708      * @deprecated This API is ICU internal only.
709      */
710     @Deprecated
711     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
712                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
713 
714 
getLanguageBreakEngine(int c)715     private LanguageBreakEngine getLanguageBreakEngine(int c) {
716 
717         // We have a dictionary character.
718         // Does an already instantiated break engine handle it?
719         // First read without synchronization, which could lead to a new language
720         // break engine being added and we didn't go over it.
721         for (LanguageBreakEngine candidate : gAllBreakEngines) {
722             if (candidate.handles(c)) {
723                 return candidate;
724             }
725         }
726 
727         synchronized (gAllBreakEngines) {
728             // Another break iterator may have instantiated the desired engine.
729             for (LanguageBreakEngine candidate : gAllBreakEngines) {
730                 if (candidate.handles(c)) {
731                     return candidate;
732                 }
733             }
734 
735             // The global list doesn't have an existing engine, build one.
736             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
737             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
738                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
739                 // Fold them together for mapping from script -> engine.
740                 script = UScript.HAN;
741             }
742 
743             LanguageBreakEngine eng;
744             try {
745                 switch (script) {
746                 case UScript.THAI:
747                     try {
748                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
749                     } catch (MissingResourceException e) {
750                         eng = new ThaiBreakEngine();
751                     }
752                     break;
753                 case UScript.LAO:
754                     eng = new LaoBreakEngine();
755                     break;
756                 case UScript.MYANMAR:
757                     try {
758                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
759                     } catch (MissingResourceException e) {
760                         eng = new BurmeseBreakEngine();
761                     }
762                     break;
763                 case UScript.KHMER:
764                     eng = new KhmerBreakEngine();
765                     break;
766                 case UScript.HAN:
767                     eng = new CjkBreakEngine(false);
768                      break;
769                 case UScript.HANGUL:
770                     eng = new CjkBreakEngine(true);
771                     break;
772                 default:
773                     gUnhandledBreakEngine.handleChar(c);
774                     eng = gUnhandledBreakEngine;
775                     break;
776                 }
777             } catch (IOException e) {
778                 eng = null;
779             }
780 
781             if (eng != null && eng != gUnhandledBreakEngine) {
782                 gAllBreakEngines.add(eng);
783             }
784             return eng;
785         }   // end synchronized(gAllBreakEngines)
786     }
787 
788     /**
789      * The State Machine Engine for moving forward is here.
790      * This function is the heart of the RBBI run time engine.
791      *
792      * Input
793      *    fPosition, the position in the text to begin from.
794      * Output
795      *    fPosition:           the boundary following the starting position.
796      *    fDictionaryCharCount the number of dictionary characters encountered.
797      *                         If > 0, the segment will be further subdivided
798      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
799      *
800      * @return the new iterator position
801      *
802      * A note on supplementary characters and the position of underlying
803      * Java CharacterIterator:   Normally, a character iterator is positioned at
804      * the char most recently returned by next().  Within this function, when
805      * a supplementary char is being processed, the char iterator is left
806      * sitting on the trail surrogate, in the middle of the code point.
807      * This is different from everywhere else, where an iterator always
808      * points at the lead surrogate of a supplementary.
809      */
handleNext()810     private int handleNext() {
811         if (TRACE) {
812             System.out.println("Handle Next   pos      char  state category");
813         }
814 
815         // handleNext always sets the break tag value.
816         // Set the default for it.
817         fRuleStatusIndex  = 0;
818         fDictionaryCharCount = 0;
819 
820         // caches for quicker access
821         CharacterIterator text = fText;
822         CodePointTrie trie = fRData.fTrie;
823 
824         char[] stateTable  = fRData.fFTable.fTable;
825         int initialPosition = fPosition;
826         text.setIndex(initialPosition);
827         int result          = initialPosition;
828 
829         // Set up the starting char
830         int c = text.current();
831         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
832             c = nextTrail32(text, c);
833             if (c == DONE32) {
834                 fDone = true;
835                 return BreakIterator.DONE;
836             }
837         }
838 
839         // Set the initial state for the state machine
840         int state           = START_STATE;
841         int row             = fRData.getRowIndex(state);
842         short category      = 3;
843         int flagsState      = fRData.fFTable.fFlags;
844         int dictStart       = fRData.fFTable.fDictCategoriesStart;
845         int mode            = RBBI_RUN;
846         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
847             category = 2;
848             mode     = RBBI_START;
849             if (TRACE) {
850                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
851                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
852                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
853             }
854         }
855 
856         // loop until we reach the end of the text or transition to state 0
857         while (state != STOP_STATE) {
858             if (c == DONE32) {
859                 // Reached end of input string.
860                 if (mode == RBBI_END) {
861                     // We have already run the loop one last time with the
862                     // character set to the pseudo {eof} value. Now it is time
863                     // to unconditionally bail out.
864                     break;
865                 }
866                 // Run the loop one last time with the fake end-of-input character category
867                 mode = RBBI_END;
868                 category = 1;
869             }
870             else if (mode == RBBI_RUN) {
871                 // Get the char category.  An incoming category of 1 or 2 mens that
872                 //      we are preset for doing the beginning or end of input, and
873                 //      that we shouldn't get a category from an actual text input character.
874                 //
875 
876                 // look up the current character's character category, which tells us
877                 // which column in the state table to look at.
878                 //
879                 category = (short) trie.get(c);
880 
881                 // Check for categories that require word dictionary handling.
882                 if (category >= dictStart) {
883                     fDictionaryCharCount++;
884                 }
885 
886                 if (TRACE) {
887                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
888                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
889                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
890                 }
891 
892                 // Advance to the next character.
893                 // If this is a beginning-of-input loop iteration, don't advance.
894                 //    The next iteration will be processing the first real input character.
895                 c = text.next();
896                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
897                     c = nextTrail32(text, c);
898                 }
899             }
900             else {
901                 mode = RBBI_RUN;
902             }
903 
904             // look up a state transition in the state table
905             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
906             row   = fRData.getRowIndex(state);
907             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
908             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
909                 // Match found, common case
910                 result = text.getIndex();
911                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
912                     // The iterator has been left in the middle of a surrogate pair.
913                     // We want the start of it.
914                     result--;
915                 }
916 
917                 //  Remember the break status (tag) values.
918                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
919             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
920                 // Lookahead match is completed
921                 int lookaheadResult = fLookAheadMatches[accepting];
922                 if (lookaheadResult >= 0) {
923                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
924                     fPosition = lookaheadResult;
925                     return lookaheadResult;
926                 }
927             }
928 
929 
930             // If we are at the position of the '/' in a look-ahead (hard break) rule;
931             // record the current position, to be returned later, if the full rule matches.
932             // TODO: Move this check before the previous check of fAccepting.
933             //       This would enable hard-break rules with no following context.
934             //       But there are line break test failures when trying this. Investigate.
935             //       Issue ICU-20837
936             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
937             if (rule != 0) {
938                 int  pos = text.getIndex();
939                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
940                     // The iterator has been left in the middle of a surrogate pair.
941                     // We want the beginning  of it.
942                     pos--;
943                 }
944                 fLookAheadMatches[rule] = pos;
945             }
946 
947 
948         }        // End of state machine main loop
949 
950         // The state machine is done.  Check whether it found a match...
951 
952         // If the iterator failed to advance in the match engine force it ahead by one.
953         // This indicates a defect in the break rules, which should always match
954         // at least one character.
955 
956         if (result == initialPosition) {
957             if (TRACE) {
958                 System.out.println("Iterator did not move. Advancing by 1.");
959             }
960             text.setIndex(initialPosition);
961             next32(text);
962             result = text.getIndex();
963             fRuleStatusIndex = 0;
964         }
965 
966         // Leave the iterator at our result position.
967         //   (we may have advanced beyond the last accepting position chasing after
968         //    longer matches that never completed.)
969         fPosition = result;
970 
971         if (TRACE) {
972             System.out.println("result = " + result);
973         }
974         return result;
975     }
976 
977     /**
978      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
979      * This locates a "Safe Position" from which the forward break rules
980      * will operate correctly. A Safe Position is not necessarily a boundary itself.
981      *
982      * The logic of this function is very similar to handleNext(), above, but simpler
983      * because the safe table does not require as many options.
984      *
985      * @param fromPosition the position in the input text to begin the iteration.
986      * @internal
987      */
handleSafePrevious(int fromPosition)988     private int handleSafePrevious(int fromPosition) {
989         char            state;
990         short           category = 0;
991         int             result = 0;
992 
993         // caches for quicker access
994         CharacterIterator text = fText;
995         CodePointTrie trie = fRData.fTrie;
996         char[] stateTable  = fRData.fRTable.fTable;
997 
998         CISetIndex32(text, fromPosition);
999         if (TRACE) {
1000             System.out.print("Handle Previous   pos   char  state category");
1001         }
1002 
1003         // if we're already at the start of the text, return DONE.
1004         if (text.getIndex() == text.getBeginIndex()) {
1005             return BreakIterator.DONE;
1006         }
1007 
1008         //  Set the initial state for the state machine
1009         int c = CharacterIteration.previous32(text);
1010         state = START_STATE;
1011         int row = fRData.getRowIndex(state);
1012 
1013         // loop until we reach the start of the text or transition to state 0
1014         //
1015         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
1016 
1017             // look up the current character's character category, which tells us
1018             // which column in the state table to look at.
1019             //
1020             //  And off the dictionary flag bit. For reverse iteration it is not used.
1021             category = (short) trie.get(c);
1022             if (TRACE) {
1023                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1024                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1025                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1026             }
1027 
1028             // State Transition - move machine to its next state
1029             //
1030             assert(category < fRData.fHeader.fCatCount);
1031             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1032             row   = fRData.getRowIndex(state);
1033 
1034             if (state == STOP_STATE) {
1035                 // This is the normal exit from the lookup state machine.
1036                 // Transition to state zero means we have found a safe point.
1037                 break;
1038             }
1039         }
1040 
1041         // The state machine is done.
1042         result = text.getIndex();
1043         if (TRACE) {
1044             System.out.println("result = " + result);
1045         }
1046         return result;
1047     }
1048 
1049     /**
1050      * Set the index of a CharacterIterator.
1051      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1052      * If the index points to a trail surrogate of a supplementary character, adjust it
1053      * to the start (lead surrogate) index.
1054      *
1055      * @param ci A CharacterIterator to set
1056      * @param index the index to set
1057      * @return the resulting index, possibly pinned or adjusted.
1058      */
1059     private static int CISetIndex32(CharacterIterator ci, int index) {
1060         if (index <= ci.getBeginIndex()) {
1061             ci.first();
1062         } else if (index >= ci.getEndIndex()) {
1063             ci.setIndex(ci.getEndIndex());
1064         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1065             if (!Character.isHighSurrogate(ci.previous())) {
1066                 ci.next();
1067             }
1068         }
1069         return ci.getIndex();
1070     }
1071 
1072     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1073      *                 Dictionary boundaries are moved first to this cache, then from here
1074      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1075      *                 boundaries. The public BreakIterator API always fetches directly
1076      *                 from the main BreakCache, not from here.
1077      *
1078      *                 In common situations, the number of boundaries in a single dictionary run
1079      *                 should be quite small, it will be terminated by punctuation, spaces,
1080      *                 or any other non-dictionary characters. The main BreakCache may end
1081      *                 up with boundaries from multiple dictionary based runs.
1082      *
1083      *                 The boundaries are stored in a simple ArrayList (vector), with the
1084      *                 assumption that they will be accessed sequentially.
1085      */
1086     class DictionaryCache  {
1087 
1088          void reset() {
1089              fPositionInCache = -1;
1090              fStart = 0;
1091              fLimit = 0;
1092              fFirstRuleStatusIndex = 0;
1093              fOtherRuleStatusIndex = 0;
1094              fBreaks.removeAllElements();
1095          };
1096 
1097          boolean following(int fromPos) {
1098              if (fromPos >= fLimit || fromPos < fStart) {
1099                  fPositionInCache = -1;
1100                  return false;
1101              }
1102 
1103              // Sequential iteration, move from previous boundary to the following
1104 
1105              int r = 0;
1106              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1107                  ++fPositionInCache;
1108                  if (fPositionInCache >= fBreaks.size()) {
1109                      fPositionInCache = -1;
1110                      return false;
1111                  }
1112                  r = fBreaks.elementAt(fPositionInCache);
1113                  assert(r > fromPos);
1114                  fBoundary = r;
1115                  fStatusIndex = fOtherRuleStatusIndex;
1116                  return true;
1117              }
1118 
1119              // Random indexing. Linear search for the boundary following the given position.
1120 
1121              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1122                  r= fBreaks.elementAt(fPositionInCache);
1123                  if (r > fromPos) {
1124                      fBoundary = r;
1125                      fStatusIndex = fOtherRuleStatusIndex;
1126                      return true;
1127                  }
1128              }
1129 
1130              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1131              assert(false);
1132              fPositionInCache = -1;
1133              return false;
1134          };
1135 
preceding(int fromPos)1136          boolean preceding(int fromPos) {
1137              if (fromPos <= fStart || fromPos > fLimit) {
1138                  fPositionInCache = -1;
1139                  return false;
1140              }
1141 
1142              if (fromPos == fLimit) {
1143                  fPositionInCache = fBreaks.size() - 1;
1144                  if (fPositionInCache >= 0) {
1145                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1146                  }
1147              }
1148 
1149              int r;
1150              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1151                  --fPositionInCache;
1152                  r = fBreaks.elementAt(fPositionInCache);
1153                  assert(r < fromPos);
1154                  fBoundary = r;
1155                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1156                  return true;
1157              }
1158 
1159              if (fPositionInCache == 0) {
1160                  fPositionInCache = -1;
1161                  return false;
1162              }
1163 
1164              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1165                  r = fBreaks.elementAt(fPositionInCache);
1166                  if (r < fromPos) {
1167                      fBoundary = r;
1168                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1169                      return true;
1170                  }
1171              }
1172              assert(false);
1173              fPositionInCache = -1;
1174              return false;
1175          };
1176 
1177         /**
1178          * Populate the cache with the dictionary based boundaries within a region of text.
1179          * @param startPos  The start position of a range of text
1180          * @param endPos    The end position of a range of text
1181          * @param firstRuleStatus The rule status index that applies to the break at startPos
1182          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1183          * @internal
1184          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1185         void populateDictionary(int startPos, int endPos,
1186                                 int firstRuleStatus, int otherRuleStatus) {
1187             if ((endPos - startPos) <= 1) {
1188                 return;
1189             }
1190 
1191             reset();
1192             fFirstRuleStatusIndex = firstRuleStatus;
1193             fOtherRuleStatusIndex = otherRuleStatus;
1194 
1195             int rangeStart = startPos;
1196             int rangeEnd = endPos;
1197 
1198             int         category;
1199             int         current;
1200             int         foundBreakCount = 0;
1201 
1202             // Loop through the text, looking for ranges of dictionary characters.
1203             // For each span, find the appropriate break engine, and ask it to find
1204             // any breaks within the span.
1205 
1206             fText.setIndex(rangeStart);
1207             int     c = CharacterIteration.current32(fText);
1208             category = (short)fRData.fTrie.get(c);
1209             int dictStart = fRData.fFTable.fDictCategoriesStart;
1210 
1211             while(true) {
1212                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1213                     c = CharacterIteration.next32(fText);    // pre-increment
1214                     category = (short)fRData.fTrie.get(c);
1215                 }
1216                 if (current >= rangeEnd) {
1217                     break;
1218                 }
1219 
1220                 // We now have a dictionary character. Get the appropriate language object
1221                 // to deal with it.
1222                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1223 
1224                 // Ask the language object if there are any breaks. It will add them to the cache and
1225                 // leave the text pointer on the other side of its range, ready to search for the next one.
1226                 if (lbe != null) {
1227                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
1228                 }
1229 
1230                 // Reload the loop variables for the next go-round
1231                 c = CharacterIteration.current32(fText);
1232                 category = (short)fRData.fTrie.get(c);
1233             }
1234 
1235             // If we found breaks, ensure that the first and last entries are
1236             // the original starting and ending position. And initialize the
1237             // cache iteration position to the first entry.
1238 
1239             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1240             if (foundBreakCount > 0) {
1241                 assert(foundBreakCount == fBreaks.size());
1242                 if (startPos < fBreaks.elementAt(0)) {
1243                     // The dictionary did not place a boundary at the start of the segment of text.
1244                     // Add one now. This should not commonly happen, but it would be easy for interactions
1245                     // of the rules for dictionary segments and the break engine implementations to
1246                     // inadvertently cause it. Cover it here, just in case.
1247                     fBreaks.offer(startPos);
1248                 }
1249                 if (endPos > fBreaks.peek()) {
1250                     fBreaks.push(endPos);
1251                 }
1252                 fPositionInCache = 0;
1253                 // Note: Dictionary matching may extend beyond the original limit.
1254                 fStart = fBreaks.elementAt(0);
1255                 fLimit = fBreaks.peek();
1256             } else {
1257                 // there were no language-based breaks, even though the segment contained
1258                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1259                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1260             }
1261 
1262         };
1263 
1264 
DictionaryCache()1265         DictionaryCache() {
1266             fPositionInCache = -1;
1267             fBreaks = new DictionaryBreakEngine.DequeI();
1268         }
1269 
1270         /**
1271          * copy constructor. Used by RuleBasedBreakIterator.clone().
1272          *
1273          * @param src the source object to be copied.
1274          */
DictionaryCache(DictionaryCache src)1275         DictionaryCache(DictionaryCache src)  {
1276             try {
1277                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1278             }
1279             catch (CloneNotSupportedException e) {
1280                 throw new RuntimeException(e);
1281             }
1282             fPositionInCache      = src.fPositionInCache;
1283             fStart                = src.fStart;
1284             fLimit                = src.fLimit;
1285             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1286             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1287             fBoundary             = src.fBoundary;
1288             fStatusIndex          = src.fStatusIndex;
1289         }
1290 
1291         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1292         DictionaryBreakEngine.DequeI fBreaks;
1293         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1294         //                                      //    or preceding(). Optimizes sequential access.
1295         int             fStart;                 // Text position of first boundary in cache.
1296         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1297         //                                      //    text segment being handled by the dictionary.
1298         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1299         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1300         int             fBoundary;              // Current boundary. Set by preceding(), following().
1301         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1302     };
1303 
1304 
1305 
1306 
1307 /*
1308  * class BreakCache
1309  *
1310  * Cache of break boundary positions and rule status values.
1311  * Break iterator API functions, next(), previous(), etc., will use cached results
1312  * when possible, and otherwise cache new results as they are obtained.
1313  *
1314  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1315  *
1316  * The cache is implemented as a single circular buffer.
1317  */
1318 
1319 /*
1320  * size of the circular cache buffer.
1321  */
1322 
1323 class BreakCache {
1324 
BreakCache()1325     BreakCache() {
1326         reset();
1327     };
1328 
reset(int pos, int ruleStatus)1329     void  reset(int pos, int ruleStatus) {
1330         fStartBufIdx = 0;
1331         fEndBufIdx = 0;
1332         fTextIdx = pos;
1333         fBufIdx = 0;
1334         fBoundaries[0] = pos;
1335         fStatuses[0] = (short)ruleStatus;
1336     }
1337 
reset()1338     void  reset() {reset(0, 0); };
1339 
next()1340     void  next() {
1341         if (fBufIdx == fEndBufIdx) {
1342             fDone = !populateFollowing();
1343             fPosition = fTextIdx;
1344             fRuleStatusIndex = fStatuses[fBufIdx];
1345         } else {
1346             fBufIdx = modChunkSize(fBufIdx + 1);
1347             fTextIdx = fPosition = fBoundaries[fBufIdx];
1348             fRuleStatusIndex = fStatuses[fBufIdx];
1349         }
1350     };
1351 
previous()1352     void  previous() {
1353         int initialBufIdx = fBufIdx;
1354         if (fBufIdx == fStartBufIdx) {
1355             // At start of cache. Prepend to it.
1356             populatePreceding();
1357         } else {
1358             // Cache already holds the next boundary
1359             fBufIdx = modChunkSize(fBufIdx - 1);
1360             fTextIdx = fBoundaries[fBufIdx];
1361         }
1362         fDone = (fBufIdx == initialBufIdx);
1363         fPosition = fTextIdx;
1364         fRuleStatusIndex = fStatuses[fBufIdx];
1365         return;
1366     };
1367 
1368     // Move the iteration state to the position following the startPosition.
1369     // Input position must be pinned to the input length.
following(int startPos)1370     void following(int startPos) {
1371         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1372             // startPos is in the cache. Do a next() from that position.
1373             // TODO: an awkward set of interactions with bi->fDone
1374             //       seek() does not clear it; it can't because of interactions with populateNear().
1375             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1376             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1377             fDone = false;
1378             next();
1379         }
1380 
1381     };
1382 
preceding(int startPos)1383     void  preceding(int startPos) {
1384         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1385             if (startPos == fTextIdx) {
1386                 previous();
1387             } else {
1388                 // seek() leaves the BreakCache positioned at the preceding boundary
1389                 //        if the requested position is between two boundaries.
1390                 // current() pushes the BreakCache position out to the BreakIterator itself.
1391                 assert(startPos > fTextIdx);
1392                 current();
1393             }
1394         }
1395         return;
1396     };
1397 
1398     /**
1399      * Update the state of the public BreakIterator (fBI) to reflect the
1400      * current state of the break iterator cache (this).
1401      */
current()1402     int current() {
1403         fPosition = fTextIdx;
1404         fRuleStatusIndex = fStatuses[fBufIdx];
1405         fDone = false;
1406         return fTextIdx;
1407     };
1408 
1409     /**
1410      * Add boundaries to the cache near the specified position.
1411      * The given position need not be a boundary itself.
1412      * The input position must be within the range of the text, and
1413      * on a code point boundary.
1414      * If the requested position is a break boundary, leave the iteration
1415      * position on it.
1416      * If the requested position is not a boundary, leave the iteration
1417      * position on the preceding boundary and include both the the
1418      * preceding and following boundaries in the cache.
1419      * Additional boundaries, either preceding or following, may be added
1420      * to the cache as a side effect.
1421      *
1422      * Return false if the operation failed.
1423      */
populateNear(int position)1424     boolean populateNear(int position) {
1425         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1426 
1427         // Add boundaries to the cache near the specified position.
1428         // The given position need not be a boundary itself.
1429         // The input position must be within the range of the text, and
1430         // on a code point boundary.
1431         // If the requested position is a break boundary, leave the iteration
1432         // position on it.
1433         // If the requested position is not a boundary, leave the iteration
1434         // position on the preceding boundary and include both the
1435         // preceding and following boundaries in the cache.
1436         // Additional boundaries, either preceding or following, may be added
1437         // to the cache as a side effect.
1438 
1439         // If the requested position is not near already cached positions, clear the existing cache,
1440         // find a near-by boundary and begin new cache contents there.
1441 
1442         // Threshold for a text position to be considered near to existing cache contents.
1443         // TODO: See issue ICU-22024 "perf tuning of Cache needed."
1444         //       This value is subject to change. See the ticket for more details.
1445         final int CACHE_NEAR = 15;
1446 
1447         int startOfText = fText.getBeginIndex();
1448         int aBoundary = -1;
1449         int ruleStatusIndex = 0;
1450         boolean retainCache = false;
1451         if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
1452             // Requested position is near the existing cache. Retain it.
1453             retainCache = true;
1454         } else if (position <= startOfText + CACHE_NEAR) {
1455             // Requested position is near the start of the text. Fill cache from start, skipping
1456             // the need to find a safe point.
1457             retainCache = false;
1458             aBoundary = startOfText;
1459         } else {
1460             // Requested position is not near the existing cache.
1461             // Find a safe point to refill the cache from.
1462             int backupPos = handleSafePrevious(position);
1463 
1464             if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
1465                 // The requested position is beyond the end of the existing cache, but the
1466                 // reverse rules produced a position near or before the cached region.
1467                 // Retain the existing cache, and fill from the end of it.
1468                 retainCache = true;
1469             } else if (backupPos < startOfText + CACHE_NEAR) {
1470                 // The safe reverse rules moved us to near the start of text.
1471                 // Take that (index 0) as the backup boundary, avoiding the complication
1472                 // (in the following block) of moving forward from the safe point to a known boundary.
1473                 //
1474                 // Retain the cache if it begins not too far from the requested position.
1475                 aBoundary = startOfText;
1476                 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
1477             } else {
1478                 // The safe reverse rules produced a position that is neither near the existing
1479                 // cache, nor near the start of text.
1480                 // Advance to the boundary following.
1481                 // There is a complication: the safe reverse rules identify pairs of code points
1482                 // that are safe. If advancing from the safe point moves forwards by less than
1483                 // two code points, we need to advance one more time to ensure that the boundary
1484                 // is good, including a correct rules status value.
1485                 //
1486                 retainCache = false;
1487                 fPosition = backupPos;
1488                 aBoundary = handleNext();
1489                 if (aBoundary == backupPos + 1 ||
1490                         (aBoundary == backupPos + 2 &&
1491                         Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1492                         Character.isLowSurrogate(fText.next()))) {
1493                     // The initial handleNext() only advanced by a single code point. Go again.
1494                     // Safe rules identify safe pairs.
1495                     aBoundary = handleNext();
1496                 }
1497                 if (aBoundary == BreakIterator.DONE) {
1498                     aBoundary = fText.getEndIndex();
1499                 }
1500                 ruleStatusIndex = fRuleStatusIndex;
1501             }
1502         }
1503 
1504         if (!retainCache) {
1505             assert(aBoundary != -1);
1506             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1507         }
1508 
1509 
1510         // Fill in boundaries between existing cache content and the new requested position.
1511 
1512         if (fBoundaries[fEndBufIdx] < position) {
1513             // The last position in the cache precedes the requested position.
1514             // Add following position(s) to the cache.
1515             while (fBoundaries[fEndBufIdx] < position) {
1516                 if (!populateFollowing()) {
1517                     assert false;
1518                     return false;
1519                 }
1520             }
1521             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1522             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1523             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1524                 previous();
1525             }
1526             return true;
1527         }
1528 
1529         if (fBoundaries[fStartBufIdx] > position) {
1530             // The first position in the cache is beyond the requested position.
1531             // back up more until we get a boundary <= the requested position.
1532             while (fBoundaries[fStartBufIdx] > position) {
1533                 populatePreceding();
1534             }
1535             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1536             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1537             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1538                 next();
1539             }
1540             if (fTextIdx > position) {
1541                 // If position is not itself a boundary, the next() loop above will overshoot.
1542                 // Back up one, leaving cache position at the boundary preceding the requested position.
1543                 previous();
1544             }
1545             return true;
1546         }
1547 
1548         assert fTextIdx == position;
1549         return true;
1550 
1551     };
1552 
1553     /**
1554      *  Add boundary(s) to the cache following the current last boundary.
1555      *  Return false if at the end of the text, and no more boundaries can be added.
1556      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1557      */
populateFollowing()1558     boolean populateFollowing() {
1559         int fromPosition = fBoundaries[fEndBufIdx];
1560         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1561         int pos = 0;
1562         int ruleStatusIdx = 0;
1563 
1564         if (fDictionaryCache.following(fromPosition)) {
1565             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1566             return true;
1567         }
1568 
1569         fPosition = fromPosition;
1570         pos = handleNext();
1571         if (pos == BreakIterator.DONE) {
1572             return false;
1573         }
1574 
1575         ruleStatusIdx = fRuleStatusIndex;
1576         if (fDictionaryCharCount > 0) {
1577             // The text segment obtained from the rules includes dictionary characters.
1578             // Subdivide it, with subdivided results going into the dictionary cache.
1579             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1580             if (fDictionaryCache.following(fromPosition)) {
1581                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1582                 return true;
1583                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1584                 //       But be careful with interactions with populateNear().
1585             }
1586         }
1587 
1588         // Rule based segment did not include dictionary characters.
1589         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1590         //    meaning that we didn't take the return, above.
1591         // Add its end point to the cache.
1592         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1593 
1594         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1595         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1596         //
1597         for (int count=0; count<6; ++count) {
1598             pos = handleNext();
1599             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1600                 break;
1601             }
1602             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1603         }
1604         return true;
1605     };
1606 
1607     /**
1608      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1609      *  Leave the iteration position on the first added boundary.
1610      *  Return false if no boundaries could be added (if at the start of the text.)
1611      */
populatePreceding()1612     boolean populatePreceding() {
1613         int textBegin = fText.getBeginIndex();
1614         int fromPosition = fBoundaries[fStartBufIdx];
1615         if (fromPosition == textBegin) {
1616             return false;
1617         }
1618 
1619         int position = textBegin;
1620         int positionStatusIdx = 0;
1621 
1622         if (fDictionaryCache.preceding(fromPosition)) {
1623             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1624             return true;
1625         }
1626 
1627         int backupPosition = fromPosition;
1628 
1629         // Find a boundary somewhere preceding the first already-cached boundary
1630         do {
1631             backupPosition = backupPosition - 30;
1632             if (backupPosition <= textBegin) {
1633                 backupPosition = textBegin;
1634             } else {
1635                 backupPosition = handleSafePrevious(backupPosition);
1636             }
1637             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1638                 position = textBegin;
1639                 positionStatusIdx = 0;
1640             } else {
1641                 // Advance to the boundary following the backup position.
1642                 // There is a complication: the safe reverse rules identify pairs of code points
1643                 // that are safe. If advancing from the safe point moves forwards by less than
1644                 // two code points, we need to advance one more time to ensure that the boundary
1645                 // is good, including a correct rules status value.
1646                 //
1647                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1648                 position = handleNext();
1649                 if (position == backupPosition + 1 ||
1650                         (position == backupPosition + 2 &&
1651                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1652                         Character.isLowSurrogate(fText.next()))) {
1653                     // The initial handleNext() only advanced by a single code point. Go again.
1654                     // Safe rules identify safe pairs.
1655                     position = handleNext();
1656                 }
1657                 positionStatusIdx = fRuleStatusIndex;
1658             }
1659         } while (position >= fromPosition);
1660 
1661         // Find boundaries between the one we just located and the first already-cached boundary
1662         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1663 
1664         fSideBuffer.removeAllElements();
1665         fSideBuffer.push(position);
1666         fSideBuffer.push(positionStatusIdx);
1667 
1668         do {
1669             int prevPosition = fPosition = position;
1670             int prevStatusIdx = positionStatusIdx;
1671             position = handleNext();
1672             positionStatusIdx = fRuleStatusIndex;
1673             if (position == BreakIterator.DONE) {
1674                 break;
1675             }
1676 
1677             boolean segmentHandledByDictionary = false;
1678             if (fDictionaryCharCount != 0) {
1679                 // Segment from the rules includes dictionary characters.
1680                 // Subdivide it, with subdivided results going into the dictionary cache.
1681                 int dictSegEndPosition = position;
1682                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1683                 while (fDictionaryCache.following(prevPosition)) {
1684                     position = fDictionaryCache.fBoundary;
1685                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1686                     segmentHandledByDictionary = true;
1687                     assert(position > prevPosition);
1688                     if (position >= fromPosition) {
1689                         break;
1690                     }
1691                     assert(position <= dictSegEndPosition);
1692                     fSideBuffer.push(position);
1693                     fSideBuffer.push(positionStatusIdx);
1694                     prevPosition = position;
1695                 }
1696                 assert(position==dictSegEndPosition || position>=fromPosition);
1697             }
1698 
1699             if (!segmentHandledByDictionary && position < fromPosition) {
1700                 fSideBuffer.push(position);
1701                 fSideBuffer.push(positionStatusIdx);
1702             }
1703         } while (position < fromPosition);
1704 
1705         // Move boundaries from the side buffer to the main circular buffer.
1706         boolean success = false;
1707         if (!fSideBuffer.isEmpty()) {
1708             positionStatusIdx = fSideBuffer.pop();
1709             position = fSideBuffer.pop();
1710             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1711             success = true;
1712         }
1713 
1714         while (!fSideBuffer.isEmpty()) {
1715             positionStatusIdx = fSideBuffer.pop();
1716             position = fSideBuffer.pop();
1717             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1718                 // No space in circular buffer to hold a new preceding result while
1719                 // also retaining the current cache (iteration) position.
1720                 // Bailing out is safe; the cache will refill again if needed.
1721                 break;
1722             }
1723         }
1724         return success;
1725     };
1726 
1727 
1728     static final boolean RetainCachePosition = false;
1729     static final boolean UpdateCachePosition = true;
1730 
1731     /**
1732      * Add the boundary following the current position.
1733      * The current position can be left as it was, or changed to the newly added boundary,
1734      * as specified by the update parameter.
1735      */
addFollowing(int position, int ruleStatusIdx, boolean update)1736     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1737         assert(position > fBoundaries[fEndBufIdx]);
1738         assert(ruleStatusIdx <= Short.MAX_VALUE);
1739         int nextIdx = modChunkSize(fEndBufIdx + 1);
1740         if (nextIdx == fStartBufIdx) {
1741             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1742         }
1743         fBoundaries[nextIdx] = position;
1744         fStatuses[nextIdx] = (short)ruleStatusIdx;
1745         fEndBufIdx = nextIdx;
1746         if (update == UpdateCachePosition) {
1747             // Set current position to the newly added boundary.
1748             fBufIdx = nextIdx;
1749             fTextIdx = position;
1750         } else {
1751             // Retaining the original cache position.
1752             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1753             // It's the responsibility of callers of this function to not add too many.
1754             assert(nextIdx != fBufIdx);
1755         }
1756 
1757     };
1758 
1759 
1760     /**
1761      * Add the boundary preceding the current position.
1762      * The current position can be left as it was, or changed to the newly added boundary,
1763      * as specified by the update parameter.
1764      */
addPreceding(int position, int ruleStatusIdx, boolean update)1765     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1766         assert(position < fBoundaries[fStartBufIdx]);
1767         assert(ruleStatusIdx <= Short.MAX_VALUE);
1768         int nextIdx = modChunkSize(fStartBufIdx - 1);
1769         if (nextIdx == fEndBufIdx) {
1770             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1771                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1772                 // current iteration position. And we also want to retain the current iteration position.
1773                 // (The buffer is already completely full of entries that precede the iteration position.)
1774                 return false;
1775             }
1776             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1777         }
1778         fBoundaries[nextIdx] = position;
1779         fStatuses[nextIdx] = (short)ruleStatusIdx;
1780         fStartBufIdx = nextIdx;
1781         if (update == UpdateCachePosition) {
1782             fBufIdx = nextIdx;
1783             fTextIdx = position;
1784         }
1785         return true;
1786     };
1787 
1788     /**
1789      *  Set the cache position to the specified position, or, if the position
1790      *  falls between to cached boundaries, to the preceding boundary.
1791      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1792      *  The startPosition must be on a code point boundary.
1793      *
1794      *  Return true if successful, false if the specified position is after
1795      *  the last cached boundary or before the first.
1796      */
1797     boolean seek(int pos) {
1798         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1799             return false;
1800         }
1801         if (pos == fBoundaries[fStartBufIdx]) {
1802             // Common case: seek(0), from BreakIterator::first()
1803             fBufIdx = fStartBufIdx;
1804             fTextIdx = fBoundaries[fBufIdx];
1805             return true;
1806         }
1807         if (pos == fBoundaries[fEndBufIdx]) {
1808             fBufIdx = fEndBufIdx;
1809             fTextIdx = fBoundaries[fBufIdx];
1810             return true;
1811         }
1812 
1813         int min = fStartBufIdx;
1814         int max = fEndBufIdx;
1815         while (min != max) {
1816             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1817             probe = modChunkSize(probe);
1818             if (fBoundaries[probe] > pos) {
1819                 max = probe;
1820             } else {
1821                 min = modChunkSize(probe + 1);
1822             }
1823         }
1824         assert(fBoundaries[max] > pos);
1825         fBufIdx = modChunkSize(max - 1);
1826         fTextIdx = fBoundaries[fBufIdx];
1827         assert(fTextIdx <= pos);
1828         return true;
1829 
1830     };
1831 
1832 
1833     /**
1834      * copy constructor, used from RuleBasedBreakIterator.clone().
1835      *
1836      * @param src
1837      */
BreakCache(BreakCache src)1838     BreakCache(BreakCache src)  {
1839         fStartBufIdx = src.fStartBufIdx;
1840         fEndBufIdx = src.fEndBufIdx;
1841         fTextIdx = src.fTextIdx;
1842         fBufIdx = src.fBufIdx;
1843         fBoundaries = src.fBoundaries.clone();
1844         fStatuses = src.fStatuses.clone();
1845         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1846     }
1847 
dumpCache()1848     void dumpCache() {
1849         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1850         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1851             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1852             if (i == fEndBufIdx) {
1853                 break;
1854             }
1855         }
1856     };
1857 
modChunkSize(int index)1858     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1859 
1860     static final int CACHE_SIZE = 128;
1861     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1862 
1863     int                 fStartBufIdx;
1864     int                 fEndBufIdx;    // inclusive
1865 
1866     int                 fTextIdx;
1867     int                 fBufIdx;
1868 
1869     int[]               fBoundaries = new int[CACHE_SIZE];
1870     short[]             fStatuses = new short[CACHE_SIZE];
1871 
1872     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1873 };
1874 
1875 
1876 
1877 
1878 }
1879 
1880