• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2005-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import static com.ibm.icu.impl.CharacterIteration.DONE32;
13 import static com.ibm.icu.impl.CharacterIteration.next32;
14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
15 
16 import java.io.ByteArrayOutputStream;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.OutputStream;
20 import java.nio.ByteBuffer;
21 import java.text.CharacterIterator;
22 import java.util.ArrayList;
23 import java.util.List;
24 import java.util.MissingResourceException;
25 
26 import com.ibm.icu.impl.CharacterIteration;
27 import com.ibm.icu.impl.ICUBinary;
28 import com.ibm.icu.impl.ICUDebug;
29 import com.ibm.icu.impl.RBBIDataWrapper;
30 import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
31 import com.ibm.icu.impl.breakiter.CjkBreakEngine;
32 import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
33 import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
34 import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
35 import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
36 import com.ibm.icu.impl.breakiter.LaoBreakEngine;
37 import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
38 import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
39 import com.ibm.icu.lang.UCharacter;
40 import com.ibm.icu.lang.UProperty;
41 import com.ibm.icu.lang.UScript;
42 import com.ibm.icu.util.CodePointTrie;
43 
44 /**
45  * Rule Based Break Iterator
46  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
47  *
48  * @stable ICU 2.0
49  */
50 public class RuleBasedBreakIterator extends BreakIterator {
51     //=======================================================================
52     // Constructors & Factories
53     //=======================================================================
54 
55     /**
56      * private constructor
57      */
RuleBasedBreakIterator()58     private RuleBasedBreakIterator() {
59         fDictionaryCharCount  = 0;
60         synchronized(gAllBreakEngines) {
61             fBreakEngines = new ArrayList<>(gAllBreakEngines);
62         }
63     }
64 
65     /**
66      * Create a break iterator from a precompiled set of break rules.
67      *
68      * Creating a break iterator from the binary rules is much faster than
69      * creating one from source rules.
70      *
71      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
72      * Binary break iterator rules are not guaranteed to be compatible between
73      * different versions of ICU.
74      *
75      * @param is an input stream supplying the compiled binary rules.
76      * @throws IOException if there is an error while reading the rules from the InputStream.
77      * @see    #compileRules(String, OutputStream)
78      * @stable ICU 4.8
79      */
getInstanceFromCompiledRules(InputStream is)80     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
81         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
82         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
83         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
84         return This;
85     }
86 
87     /**
88      * This factory method doesn't have an access modifier; it is only accessible in the same
89      * package.
90      *
91      * Create a break iterator from a precompiled set of break rules.
92      *
93      * Creating a break iterator from the binary rules is much faster than
94      * creating one from source rules.
95      *
96      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
97      * Binary break iterator rules are not guaranteed to be compatible between
98      * different versions of ICU.
99      *
100      * @param bytes a buffer supplying the compiled binary rules.
101      * @param phraseBreaking a flag indicating if phrase breaking is required.
102      * @throws IOException if there is an error while reading the rules from the buffer.
103      * @see    #compileRules(String, OutputStream)
104      * @internal
105      */
getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)106     /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
107             ByteBuffer bytes, boolean phraseBreaking) throws IOException {
108         RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
109         instance.fPhraseBreaking = phraseBreaking;
110         return instance;
111     }
112 
113     /**
114      * Create a break iterator from a precompiled set of break rules.
115      *
116      * Creating a break iterator from the binary rules is much faster than
117      * creating one from source rules.
118      *
119      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
120      * Binary break iterator rules are not guaranteed to be compatible between
121      * different versions of ICU.
122      *
123      * @param bytes a buffer supplying the compiled binary rules.
124      * @throws IOException if there is an error while reading the rules from the buffer.
125      * @see    #compileRules(String, OutputStream)
126      * @internal
127      * @deprecated This API is ICU internal only.
128      */
129     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)130     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
131         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
132         This.fRData = RBBIDataWrapper.get(bytes);
133         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
134         return This;
135     }
136 
137     /**
138      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
139      * @param rules The break rules to be used.
140      * @stable ICU 2.2
141      */
RuleBasedBreakIterator(String rules)142     public RuleBasedBreakIterator(String rules)  {
143         this();
144         try {
145             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
146             compileRules(rules, ruleOS);
147             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
148             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
149         } catch (IOException e) {
150             ///CLOVER:OFF
151             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
152             //  causing bogus compiled rules to be produced, but with no compile error raised.
153             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
154                     + e.getMessage());
155             throw rte;
156             ///CLOVER:ON
157         }
158     }
159 
160     //=======================================================================
161     // Boilerplate
162     //=======================================================================
163 
164     /**
165      * Clones this iterator.
166      * @return A newly-constructed RuleBasedBreakIterator with the same
167      * behavior as this one.
168      * @stable ICU 2.0
169      */
170     @Override
clone()171     public Object clone()  {
172         RuleBasedBreakIterator result;
173         result = (RuleBasedBreakIterator)super.clone();
174         if (fText != null) {
175             result.fText = (CharacterIterator)(fText.clone());
176         }
177         synchronized (gAllBreakEngines)  {
178             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
179         }
180         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
181         result.fBreakCache = result.new BreakCache(fBreakCache);
182         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
183         return result;
184     }
185 
186 
187     /**
188      * Returns true if both BreakIterators are of the same class, have the same
189      * rules, and iterate over the same text.
190      * @stable ICU 2.0
191      */
192     @Override
equals(Object that)193     public boolean equals(Object that) {
194         if (that == null) {
195             return false;
196         }
197         if (this == that) {
198             return true;
199         }
200         try {
201             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
202             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
203                 return false;
204             }
205             if (fRData != null && other.fRData != null &&
206                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
207                 return false;
208             }
209             if (fText == null && other.fText == null) {
210                 return true;
211             }
212             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
213                 return false;
214             }
215             return fPosition == other.fPosition;
216         }
217         catch(ClassCastException e) {
218             return false;
219         }
220      }
221 
222     /**
223      * Returns the description (rules) used to create this iterator.
224      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
225      * @stable ICU 2.0
226      */
227     @Override
toString()228     public String toString() {
229         String retStr = "";
230         if (fRData != null) {
231             retStr =  fRData.fRuleSource;
232         }
233         return retStr;
234     }
235 
236     /**
237      * Compute a hashcode for this BreakIterator
238      * @return A hash code
239      * @stable ICU 2.0
240      */
241     @Override
hashCode()242     public int hashCode()
243     {
244         return fRData.fRuleSource.hashCode();
245     }
246 
247 
248     private static final int  START_STATE = 1;     // The state number of the starting state
249     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
250 
251     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
252     //               of user text.  A variable with this enum type keeps track of where we
253     //               are.  The state machine only fetches user text input while in RUN mode.
254     private static final int  RBBI_START  = 0;
255     private static final int  RBBI_RUN    = 1;
256     private static final int  RBBI_END    = 2;
257 
258     /**
259      * The character iterator through which this BreakIterator accesses the text.
260      */
261     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
262 
263     /**
264      * The rule data for this BreakIterator instance.
265      * Not intended for public use. Declared public for testing purposes only.
266      * @internal
267      * @deprecated This API is ICU internal only.
268      */
269     @Deprecated
270     public RBBIDataWrapper    fRData;
271 
272     /**
273      *  The iteration state - current position, rule status for the current position,
274      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
275      *                        Current position is pinned to be 0 < position <= text.length.
276      *                        Current position is always set to a boundary.
277      *
278      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
279      *  Never has the value UBRK_DONE (-1).
280      */
281     private int                fPosition;
282 
283     /**
284      * Index of the Rule {tag} values for the most recent match.
285      */
286     private int                fRuleStatusIndex;
287 
288     /**
289      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
290      */
291     private boolean            fDone;
292 
293     /**
294      *  Array of look-ahead tentative results.
295      */
296     private int[]              fLookAheadMatches;
297 
298     /**
299      *   Cache of previously determined boundary positions.
300      */
301     private BreakCache         fBreakCache = new BreakCache();
302 
303     /**
304      * Flag used to indicate if phrase breaking is required.
305      */
306     private boolean            fPhraseBreaking = false;
307 
308 
309     /**
310      * Counter for the number of characters encountered with the "dictionary"
311      *   flag set.  Normal RBBI iterators don't use it, although the code
312      *   for updating it is live.  Dictionary Based break iterators (a subclass
313      *   of us) access this field directly.
314      * @internal
315      */
316     private int fDictionaryCharCount;
317 
318     private DictionaryCache     fDictionaryCache = new DictionaryCache();
319 
320     /**
321      * ICU debug argument name for RBBI
322      */
323     private static final String RBBI_DEBUG_ARG = "rbbi";
324 
325     /**
326      * Debugging flag.  Trace operation of state machine when true.
327      */
328     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
329             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
330 
331     /**
332      * The "default" break engine - just skips over ranges of dictionary words,
333      * producing no breaks. Should only be used if characters need to be handled
334      * by a dictionary but we have no dictionary implementation for them.
335      *
336      * Only one instance; shared by all break iterators.
337      */
338     private static final UnhandledBreakEngine gUnhandledBreakEngine;
339 
340     /**
341      * List of all known break engines, common for all break iterators.
342      * Lazily updated as break engines are needed, because instantiation of
343      * break engines is expensive.
344      *
345      * Because gAllBreakEngines can be referenced concurrently from different
346      * BreakIterator instances, all access is synchronized.
347      */
348     private static final List<LanguageBreakEngine> gAllBreakEngines;
349 
350     static {
351         gUnhandledBreakEngine = new UnhandledBreakEngine();
352         gAllBreakEngines = new ArrayList<>();
353         gAllBreakEngines.add(gUnhandledBreakEngine);
354     }
355 
356     /**
357      * List of all known break engines. Similar to gAllBreakEngines, but local to a
358      * break iterator, allowing it to be used without synchronization.
359      */
360     private List<LanguageBreakEngine> fBreakEngines;
361 
362     /**
363      * Dump the contents of the state table and character classes for this break iterator.
364      * For debugging only.
365      * @internal
366      * @deprecated This API is ICU internal only.
367      */
368     @Deprecated
dump(java.io.PrintStream out)369     public void dump(java.io.PrintStream out) {
370         if (out == null) {
371             out = System.out;
372         }
373         this.fRData.dump(out);
374     }
375 
376     /**
377      * Compile a set of source break rules into the binary state tables used
378      * by the break iterator engine.  Creating a break iterator from precompiled
379      * rules is much faster than creating one from source rules.
380      *
381      * Binary break rules are not guaranteed to be compatible between different
382      * versions of ICU.
383      *
384      *
385      * @param rules  The source form of the break rules
386      * @param ruleBinary  An output stream to receive the compiled rules.
387      * @throws IOException If there is an error writing the output.
388      * @see #getInstanceFromCompiledRules(InputStream)
389      * @stable ICU 4.8
390      */
compileRules(String rules, OutputStream ruleBinary)391     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
392         RBBIRuleBuilder.compileRules(rules, ruleBinary);
393     }
394 
395     //=======================================================================
396     // BreakIterator overrides
397     //=======================================================================
398 
399     /**
400      * Sets the current iteration position to the beginning of the text.
401      * (i.e., the CharacterIterator's starting offset).
402      * @return The offset of the beginning of the text.
403      * @stable ICU 2.0
404      */
405     @Override
first()406     public int first() {
407         if (fText == null) {
408             return BreakIterator.DONE;
409         }
410         fText.first();
411         int start =  fText.getIndex();
412         if (!fBreakCache.seek(start)) {
413             fBreakCache.populateNear(start);
414         }
415         fBreakCache.current();
416         assert(fPosition == start);
417         return fPosition;
418     }
419 
420     /**
421      * Sets the current iteration position to the end of the text.
422      * (i.e., the CharacterIterator's ending offset).
423      * @return The text's past-the-end offset.
424      * @stable ICU 2.0
425      */
426     @Override
last()427     public int last() {
428         if (fText == null) {
429             return BreakIterator.DONE;
430         }
431         int endPos = fText.getEndIndex();
432         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
433         assert(endShouldBeBoundary);
434         if (fPosition != endPos) {
435             assert(fPosition == endPos);
436         }
437         return endPos;
438     }
439 
440     /**
441      * Advances the iterator either forward or backward the specified number of steps.
442      * Negative values move backward, and positive values move forward.  This is
443      * equivalent to repeatedly calling next() or previous().
444      * @param n The number of steps to move.  The sign indicates the direction
445      * (negative is backwards, and positive is forwards).
446      * @return The character offset of the boundary position n boundaries away from
447      * the current one.
448      * @stable ICU 2.0
449      */
450     @Override
next(int n)451     public int next(int n) {
452         int result = 0;
453         if (n > 0) {
454             for (; n > 0 && result != DONE; --n) {
455                 result = next();
456             }
457         } else if (n < 0) {
458             for (; n < 0 && result != DONE; ++n) {
459                 result = previous();
460             }
461         } else {
462             result = current();
463         }
464         return result;
465     }
466 
467     /**
468      * Advances the iterator to the next boundary position.
469      * @return The position of the first boundary after this one.
470      * @stable ICU 2.0
471      */
472     @Override
next()473     public int next() {
474         fBreakCache.next();
475         return fDone ? DONE : fPosition;
476     }
477 
478     /**
479      * Moves the iterator backwards, to the boundary preceding the current one.
480      * @return The position of the boundary position immediately preceding the starting position.
481      * @stable ICU 2.0
482      */
483     @Override
previous()484     public int previous() {
485         fBreakCache.previous();
486         return fDone ? DONE : fPosition;
487     }
488 
489     /**
490      * Sets the iterator to refer to the first boundary position following
491      * the specified position.
492      * @param startPos The position from which to begin searching for a break position.
493      * @return The position of the first break after the current position.
494      * @stable ICU 2.0
495      */
496     @Override
following(int startPos)497     public int following(int startPos) {
498         // if the supplied position is before the beginning, return the
499         // text's starting offset
500         if (startPos < fText.getBeginIndex()) {
501             return first();
502         }
503 
504         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
505         // Or it may be beyond the end of the text.
506         startPos = CISetIndex32(fText, startPos);
507         fBreakCache.following(startPos);
508         return fDone ? DONE : fPosition;
509     }
510 
511 
512     /**
513      * Sets the iterator to refer to the last boundary position before the
514      * specified position.
515      * @param offset The position to begin searching for a break from.
516      * @return The position of the last boundary before the starting position.
517      * @stable ICU 2.0
518      */
519     @Override
preceding(int offset)520     public int preceding(int offset) {
521         if (fText == null || offset > fText.getEndIndex()) {
522             return last();
523         } else if (offset < fText.getBeginIndex()) {
524             return first();
525         }
526 
527         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
528         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
529         int adjustedOffset = offset;
530         fBreakCache.preceding(adjustedOffset);
531         return fDone ? DONE : fPosition;
532 
533     }
534 
535 
536     /**
537      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
538      * @stable ICU 2.0
539      */
checkOffset(int offset, CharacterIterator text)540     protected static final void checkOffset(int offset, CharacterIterator text) {
541         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
542             throw new IllegalArgumentException("offset out of bounds");
543         }
544     }
545 
546 
547     /**
548      * Returns true if the specified position is a boundary position.  As a side
549      * effect, leaves the iterator pointing to the first boundary position at
550      * or after "offset".
551      * @param offset the offset to check.
552      * @return True if "offset" is a boundary position.
553      * @stable ICU 2.0
554      */
555     @Override
isBoundary(int offset)556     public boolean isBoundary(int offset) {
557         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
558         //       to not be boundaries, and to not be errors.
559         checkOffset(offset, fText);
560 
561         // Adjust offset to be on a code point boundary and not beyond the end of the text.
562         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
563         // But we still need the side effect of leaving iteration at the following boundary.
564         int adjustedOffset = CISetIndex32(fText, offset);
565 
566         boolean result = false;
567         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
568             result = (fBreakCache.current() == offset);
569         }
570 
571         if (!result) {
572             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
573             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
574             next();
575         }
576         return result;
577 
578     }
579 
580     /**
581      * Returns the current iteration position.  Note that DONE is never
582      * returned from this function; if iteration has run to the end of a
583      * string, current() will return the length of the string while
584      * next() will return BreakIterator.DONE).
585      * @return The current iteration position.
586      * @stable ICU 2.0
587      */
588     @Override
current()589     public int current() {
590         return (fText != null) ? fPosition : BreakIterator.DONE;
591     }
592 
593 
594     /**
595      * Return the status tag from the break rule that determined the boundary at
596      * the current iteration position.  The values appear in the rule source
597      * within brackets, {123}, for example.  For rules that do not specify a
598      * status, a default value of 0 is returned.  If more than one rule applies,
599      * the numerically largest of the possible status values is returned.
600      * <p>
601      * Of the standard types of ICU break iterators, only the word and line break
602      * iterator provides status values.  The values are defined in
603      * class RuleBasedBreakIterator, and allow distinguishing between words
604      * that contain alphabetic letters, "words" that appear to be numbers,
605      * punctuation and spaces, words containing ideographic characters, and
606      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
607      * position from <code>next()</code>, <code>previous()</code>, or
608      * any other break iterator functions that returns a boundary position.
609      * <p>
610      * Note that <code>getRuleStatus()</code> returns the value corresponding to
611      * <code>current()</code> index even after <code>next()</code> has returned DONE.
612      * <p>
613 
614      * @return the status from the break rule that determined the boundary
615      * at the current iteration position.
616      *
617      * @stable ICU 60
618      */
619 
620     @Override
getRuleStatus()621     public int  getRuleStatus() {
622         //   Status records have this form:
623         //           Count N         <--  fLastRuleStatusIndex points here.
624         //           Status val 0
625         //           Status val 1
626         //              ...
627         //           Status val N-1  <--  the value we need to return
628         //   The status values are sorted in ascending order.
629         //   This function returns the last (largest) of the array of status values.
630         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
631         int  tagVal = fRData.fStatusTable[idx];
632         return tagVal;
633     }
634 
635     /**
636      * Get the status (tag) values from the break rule(s) that determined the boundary
637      * at the current iteration position.  The values appear in the rule source
638      * within brackets, {123}, for example.  The default status value for rules
639      * that do not explicitly provide one is zero.
640      * <p>
641      * The status values used by the standard ICU break rules are defined
642      * as public constants in class RuleBasedBreakIterator.
643      * <p>
644      * If the size  of the output array is insufficient to hold the data,
645      *  the output will be truncated to the available length.  No exception
646      *  will be thrown.
647      *
648      * @param fillInArray an array to be filled in with the status values.
649      * @return          The number of rule status values from the rules that determined
650      *                  the boundary at the current iteration position.
651      *                  In the event that the array is too small, the return value
652      *                  is the total number of status values that were available,
653      *                  not the reduced number that were actually returned.
654      * @stable ICU 60
655      */
656     @Override
getRuleStatusVec(int[] fillInArray)657     public int getRuleStatusVec(int[] fillInArray) {
658         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
659         if (fillInArray != null) {
660             int numToCopy = Math.min(numStatusVals, fillInArray.length);
661             for (int i=0; i<numToCopy; i++) {
662                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
663             }
664         }
665         return numStatusVals;
666     }
667 
668     /**
669      * Returns a CharacterIterator over the text being analyzed.
670      * <p>
671      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
672      * must not be modified in any way while the BreakIterator is still in use.
673      * Doing so will lead to undefined behavior of the BreakIterator.
674      * Clone the returned CharacterIterator first and work with that.
675      * <p>
676      * The returned CharacterIterator is a reference
677      * to the <b>actual iterator being used</b> by the BreakIterator.
678      * No guarantees are made about the current position
679      * of this iterator when it is returned; it may differ from the
680      * BreakIterators current position.  If you need to move that
681      * position to examine the text, clone this function's return value first.
682      * @return An iterator over the text being analyzed.
683      * @stable ICU 2.0
684      */
685     @Override
getText()686     public CharacterIterator getText() {
687         return fText;
688     }
689 
690     /**
691      * Set the iterator to analyze a new piece of text.  This function resets
692      * the current iteration position to the beginning of the text.
693      * (The old iterator is dropped.)
694      * <p>
695      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
696      * directly by the BreakIterator, and must not be altered in any
697      * way by code outside of the BreakIterator.
698      * Doing so will lead to undefined behavior of the BreakIterator.
699      *
700      * @param newText An iterator over the text to analyze.
701      * @stable ICU 2.0
702      */
703     @Override
setText(CharacterIterator newText)704     public void setText(CharacterIterator newText) {
705         if (newText != null) {
706             fBreakCache.reset(newText.getBeginIndex(), 0);
707         } else {
708             fBreakCache.reset();
709         }
710         fDictionaryCache.reset();
711         fText = newText;
712         this.first();
713     }
714 
715      /**
716      * Control debug, trace and dump options.
717      * @internal
718      * @deprecated This API is ICU internal only.
719      */
720     @Deprecated
721     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
722                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
723 
724 
getLanguageBreakEngine(int c)725     private LanguageBreakEngine getLanguageBreakEngine(int c) {
726 
727         // We have a dictionary character.
728         // Does an already instantiated break engine handle it?
729         for (LanguageBreakEngine candidate : fBreakEngines) {
730             if (candidate.handles(c)) {
731                 return candidate;
732             }
733         }
734 
735         synchronized (gAllBreakEngines) {
736             // This break iterator's list of break engines didn't handle the character.
737             // Check the global list, another break iterator may have instantiated the
738             // desired engine.
739             for (LanguageBreakEngine candidate : gAllBreakEngines) {
740                 if (candidate.handles(c)) {
741                     fBreakEngines.add(candidate);
742                     return candidate;
743                 }
744             }
745 
746             // The global list doesn't have an existing engine, build one.
747             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
748             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
749                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
750                 // Fold them together for mapping from script -> engine.
751                 script = UScript.HAN;
752             }
753 
754             LanguageBreakEngine eng;
755             try {
756                 switch (script) {
757                 case UScript.THAI:
758                     try {
759                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
760                     } catch (MissingResourceException e) {
761                         eng = new ThaiBreakEngine();
762                     }
763                     break;
764                 case UScript.LAO:
765                     eng = new LaoBreakEngine();
766                     break;
767                 case UScript.MYANMAR:
768                     try {
769                         eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script));
770                     } catch (MissingResourceException e) {
771                         eng = new BurmeseBreakEngine();
772                     }
773                     break;
774                 case UScript.KHMER:
775                     eng = new KhmerBreakEngine();
776                     break;
777                 case UScript.HAN:
778                     eng = new CjkBreakEngine(false);
779                      break;
780                 case UScript.HANGUL:
781                     eng = new CjkBreakEngine(true);
782                     break;
783                 default:
784                     gUnhandledBreakEngine.handleChar(c);
785                     eng = gUnhandledBreakEngine;
786                     break;
787                 }
788             } catch (IOException e) {
789                 eng = null;
790             }
791 
792             if (eng != null && eng != gUnhandledBreakEngine) {
793                 gAllBreakEngines.add(eng);
794                 fBreakEngines.add(eng);
795             }
796             return eng;
797         }   // end synchronized(gAllBreakEngines)
798     }
799 
800     /**
801      * The State Machine Engine for moving forward is here.
802      * This function is the heart of the RBBI run time engine.
803      *
804      * Input
805      *    fPosition, the position in the text to begin from.
806      * Output
807      *    fPosition:           the boundary following the starting position.
808      *    fDictionaryCharCount the number of dictionary characters encountered.
809      *                         If > 0, the segment will be further subdivided
810      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
811      *
812      * @return the new iterator position
813      *
814      * A note on supplementary characters and the position of underlying
815      * Java CharacterIterator:   Normally, a character iterator is positioned at
816      * the char most recently returned by next().  Within this function, when
817      * a supplementary char is being processed, the char iterator is left
818      * sitting on the trail surrogate, in the middle of the code point.
819      * This is different from everywhere else, where an iterator always
820      * points at the lead surrogate of a supplementary.
821      */
handleNext()822     private int handleNext() {
823         if (TRACE) {
824             System.out.println("Handle Next   pos      char  state category");
825         }
826 
827         // handleNext always sets the break tag value.
828         // Set the default for it.
829         fRuleStatusIndex  = 0;
830         fDictionaryCharCount = 0;
831 
832         // caches for quicker access
833         CharacterIterator text = fText;
834         CodePointTrie trie = fRData.fTrie;
835 
836         char[] stateTable  = fRData.fFTable.fTable;
837         int initialPosition = fPosition;
838         text.setIndex(initialPosition);
839         int result          = initialPosition;
840 
841         // Set up the starting char
842         int c = text.current();
843         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
844             c = nextTrail32(text, c);
845             if (c == DONE32) {
846                 fDone = true;
847                 return BreakIterator.DONE;
848             }
849         }
850 
851         // Set the initial state for the state machine
852         int state           = START_STATE;
853         int row             = fRData.getRowIndex(state);
854         short category      = 3;
855         int flagsState      = fRData.fFTable.fFlags;
856         int dictStart       = fRData.fFTable.fDictCategoriesStart;
857         int mode            = RBBI_RUN;
858         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
859             category = 2;
860             mode     = RBBI_START;
861             if (TRACE) {
862                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
863                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
864                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
865             }
866         }
867 
868         // loop until we reach the end of the text or transition to state 0
869         while (state != STOP_STATE) {
870             if (c == DONE32) {
871                 // Reached end of input string.
872                 if (mode == RBBI_END) {
873                     // We have already run the loop one last time with the
874                     // character set to the pseudo {eof} value. Now it is time
875                     // to unconditionally bail out.
876                     break;
877                 }
878                 // Run the loop one last time with the fake end-of-input character category
879                 mode = RBBI_END;
880                 category = 1;
881             }
882             else if (mode == RBBI_RUN) {
883                 // Get the char category.  An incoming category of 1 or 2 mens that
884                 //      we are preset for doing the beginning or end of input, and
885                 //      that we shouldn't get a category from an actual text input character.
886                 //
887 
888                 // look up the current character's character category, which tells us
889                 // which column in the state table to look at.
890                 //
891                 category = (short) trie.get(c);
892 
893                 // Check for categories that require word dictionary handling.
894                 if (category >= dictStart) {
895                     fDictionaryCharCount++;
896                 }
897 
898                 if (TRACE) {
899                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
900                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
901                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
902                 }
903 
904                 // Advance to the next character.
905                 // If this is a beginning-of-input loop iteration, don't advance.
906                 //    The next iteration will be processing the first real input character.
907                 c = text.next();
908                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
909                     c = nextTrail32(text, c);
910                 }
911             }
912             else {
913                 mode = RBBI_RUN;
914             }
915 
916             // look up a state transition in the state table
917             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
918             row   = fRData.getRowIndex(state);
919             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
920             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
921                 // Match found, common case
922                 result = text.getIndex();
923                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
924                     // The iterator has been left in the middle of a surrogate pair.
925                     // We want the start of it.
926                     result--;
927                 }
928 
929                 //  Remember the break status (tag) values.
930                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
931             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
932                 // Lookahead match is completed
933                 int lookaheadResult = fLookAheadMatches[accepting];
934                 if (lookaheadResult >= 0) {
935                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
936                     fPosition = lookaheadResult;
937                     return lookaheadResult;
938                 }
939             }
940 
941 
942             // If we are at the position of the '/' in a look-ahead (hard break) rule;
943             // record the current position, to be returned later, if the full rule matches.
944             // TODO: Move this check before the previous check of fAccepting.
945             //       This would enable hard-break rules with no following context.
946             //       But there are line break test failures when trying this. Investigate.
947             //       Issue ICU-20837
948             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
949             if (rule != 0) {
950                 int  pos = text.getIndex();
951                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
952                     // The iterator has been left in the middle of a surrogate pair.
953                     // We want the beginning  of it.
954                     pos--;
955                 }
956                 fLookAheadMatches[rule] = pos;
957             }
958 
959 
960         }        // End of state machine main loop
961 
962         // The state machine is done.  Check whether it found a match...
963 
964         // If the iterator failed to advance in the match engine force it ahead by one.
965         // This indicates a defect in the break rules, which should always match
966         // at least one character.
967 
968         if (result == initialPosition) {
969             if (TRACE) {
970                 System.out.println("Iterator did not move. Advancing by 1.");
971             }
972             text.setIndex(initialPosition);
973             next32(text);
974             result = text.getIndex();
975             fRuleStatusIndex = 0;
976         }
977 
978         // Leave the iterator at our result position.
979         //   (we may have advanced beyond the last accepting position chasing after
980         //    longer matches that never completed.)
981         fPosition = result;
982 
983         if (TRACE) {
984             System.out.println("result = " + result);
985         }
986         return result;
987     }
988 
989     /**
990      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
991      * This locates a "Safe Position" from which the forward break rules
992      * will operate correctly. A Safe Position is not necessarily a boundary itself.
993      *
994      * The logic of this function is very similar to handleNext(), above, but simpler
995      * because the safe table does not require as many options.
996      *
997      * @param fromPosition the position in the input text to begin the iteration.
998      * @internal
999      */
handleSafePrevious(int fromPosition)1000     private int handleSafePrevious(int fromPosition) {
1001         char            state;
1002         short           category = 0;
1003         int             result = 0;
1004 
1005         // caches for quicker access
1006         CharacterIterator text = fText;
1007         CodePointTrie trie = fRData.fTrie;
1008         char[] stateTable  = fRData.fRTable.fTable;
1009 
1010         CISetIndex32(text, fromPosition);
1011         if (TRACE) {
1012             System.out.print("Handle Previous   pos   char  state category");
1013         }
1014 
1015         // if we're already at the start of the text, return DONE.
1016         if (text.getIndex() == text.getBeginIndex()) {
1017             return BreakIterator.DONE;
1018         }
1019 
1020         //  Set the initial state for the state machine
1021         int c = CharacterIteration.previous32(text);
1022         state = START_STATE;
1023         int row = fRData.getRowIndex(state);
1024 
1025         // loop until we reach the start of the text or transition to state 0
1026         //
1027         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
1028 
1029             // look up the current character's character category, which tells us
1030             // which column in the state table to look at.
1031             //
1032             //  And off the dictionary flag bit. For reverse iteration it is not used.
1033             category = (short) trie.get(c);
1034             if (TRACE) {
1035                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1036                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1037                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1038             }
1039 
1040             // State Transition - move machine to its next state
1041             //
1042             assert(category < fRData.fHeader.fCatCount);
1043             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1044             row   = fRData.getRowIndex(state);
1045 
1046             if (state == STOP_STATE) {
1047                 // This is the normal exit from the lookup state machine.
1048                 // Transition to state zero means we have found a safe point.
1049                 break;
1050             }
1051         }
1052 
1053         // The state machine is done.
1054         result = text.getIndex();
1055         if (TRACE) {
1056             System.out.println("result = " + result);
1057         }
1058         return result;
1059     }
1060 
1061     /**
1062      * Set the index of a CharacterIterator.
1063      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1064      * If the index points to a trail surrogate of a supplementary character, adjust it
1065      * to the start (lead surrogate) index.
1066      *
1067      * @param ci A CharacterIterator to set
1068      * @param index the index to set
1069      * @return the resulting index, possibly pinned or adjusted.
1070      */
1071     private static int CISetIndex32(CharacterIterator ci, int index) {
1072         if (index <= ci.getBeginIndex()) {
1073             ci.first();
1074         } else if (index >= ci.getEndIndex()) {
1075             ci.setIndex(ci.getEndIndex());
1076         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1077             if (!Character.isHighSurrogate(ci.previous())) {
1078                 ci.next();
1079             }
1080         }
1081         return ci.getIndex();
1082     }
1083 
1084     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1085      *                 Dictionary boundaries are moved first to this cache, then from here
1086      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1087      *                 boundaries. The public BreakIterator API always fetches directly
1088      *                 from the main BreakCache, not from here.
1089      *
1090      *                 In common situations, the number of boundaries in a single dictionary run
1091      *                 should be quite small, it will be terminated by punctuation, spaces,
1092      *                 or any other non-dictionary characters. The main BreakCache may end
1093      *                 up with boundaries from multiple dictionary based runs.
1094      *
1095      *                 The boundaries are stored in a simple ArrayList (vector), with the
1096      *                 assumption that they will be accessed sequentially.
1097      */
1098     class DictionaryCache  {
1099 
1100          void reset() {
1101              fPositionInCache = -1;
1102              fStart = 0;
1103              fLimit = 0;
1104              fFirstRuleStatusIndex = 0;
1105              fOtherRuleStatusIndex = 0;
1106              fBreaks.removeAllElements();
1107          };
1108 
1109          boolean following(int fromPos) {
1110              if (fromPos >= fLimit || fromPos < fStart) {
1111                  fPositionInCache = -1;
1112                  return false;
1113              }
1114 
1115              // Sequential iteration, move from previous boundary to the following
1116 
1117              int r = 0;
1118              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1119                  ++fPositionInCache;
1120                  if (fPositionInCache >= fBreaks.size()) {
1121                      fPositionInCache = -1;
1122                      return false;
1123                  }
1124                  r = fBreaks.elementAt(fPositionInCache);
1125                  assert(r > fromPos);
1126                  fBoundary = r;
1127                  fStatusIndex = fOtherRuleStatusIndex;
1128                  return true;
1129              }
1130 
1131              // Random indexing. Linear search for the boundary following the given position.
1132 
1133              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1134                  r= fBreaks.elementAt(fPositionInCache);
1135                  if (r > fromPos) {
1136                      fBoundary = r;
1137                      fStatusIndex = fOtherRuleStatusIndex;
1138                      return true;
1139                  }
1140              }
1141 
1142              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1143              assert(false);
1144              fPositionInCache = -1;
1145              return false;
1146          };
1147 
preceding(int fromPos)1148          boolean preceding(int fromPos) {
1149              if (fromPos <= fStart || fromPos > fLimit) {
1150                  fPositionInCache = -1;
1151                  return false;
1152              }
1153 
1154              if (fromPos == fLimit) {
1155                  fPositionInCache = fBreaks.size() - 1;
1156                  if (fPositionInCache >= 0) {
1157                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1158                  }
1159              }
1160 
1161              int r;
1162              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1163                  --fPositionInCache;
1164                  r = fBreaks.elementAt(fPositionInCache);
1165                  assert(r < fromPos);
1166                  fBoundary = r;
1167                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1168                  return true;
1169              }
1170 
1171              if (fPositionInCache == 0) {
1172                  fPositionInCache = -1;
1173                  return false;
1174              }
1175 
1176              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1177                  r = fBreaks.elementAt(fPositionInCache);
1178                  if (r < fromPos) {
1179                      fBoundary = r;
1180                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1181                      return true;
1182                  }
1183              }
1184              assert(false);
1185              fPositionInCache = -1;
1186              return false;
1187          };
1188 
1189         /**
1190          * Populate the cache with the dictionary based boundaries within a region of text.
1191          * @param startPos  The start position of a range of text
1192          * @param endPos    The end position of a range of text
1193          * @param firstRuleStatus The rule status index that applies to the break at startPos
1194          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1195          * @internal
1196          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1197         void populateDictionary(int startPos, int endPos,
1198                                 int firstRuleStatus, int otherRuleStatus) {
1199             if ((endPos - startPos) <= 1) {
1200                 return;
1201             }
1202 
1203             reset();
1204             fFirstRuleStatusIndex = firstRuleStatus;
1205             fOtherRuleStatusIndex = otherRuleStatus;
1206 
1207             int rangeStart = startPos;
1208             int rangeEnd = endPos;
1209 
1210             int         category;
1211             int         current;
1212             int         foundBreakCount = 0;
1213 
1214             // Loop through the text, looking for ranges of dictionary characters.
1215             // For each span, find the appropriate break engine, and ask it to find
1216             // any breaks within the span.
1217 
1218             fText.setIndex(rangeStart);
1219             int     c = CharacterIteration.current32(fText);
1220             category = (short)fRData.fTrie.get(c);
1221             int dictStart = fRData.fFTable.fDictCategoriesStart;
1222 
1223             while(true) {
1224                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1225                     c = CharacterIteration.next32(fText);    // pre-increment
1226                     category = (short)fRData.fTrie.get(c);
1227                 }
1228                 if (current >= rangeEnd) {
1229                     break;
1230                 }
1231 
1232                 // We now have a dictionary character. Get the appropriate language object
1233                 // to deal with it.
1234                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1235 
1236                 // Ask the language object if there are any breaks. It will add them to the cache and
1237                 // leave the text pointer on the other side of its range, ready to search for the next one.
1238                 if (lbe != null) {
1239                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
1240                 }
1241 
1242                 // Reload the loop variables for the next go-round
1243                 c = CharacterIteration.current32(fText);
1244                 category = (short)fRData.fTrie.get(c);
1245             }
1246 
1247             // If we found breaks, ensure that the first and last entries are
1248             // the original starting and ending position. And initialize the
1249             // cache iteration position to the first entry.
1250 
1251             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1252             if (foundBreakCount > 0) {
1253                 assert(foundBreakCount == fBreaks.size());
1254                 if (startPos < fBreaks.elementAt(0)) {
1255                     // The dictionary did not place a boundary at the start of the segment of text.
1256                     // Add one now. This should not commonly happen, but it would be easy for interactions
1257                     // of the rules for dictionary segments and the break engine implementations to
1258                     // inadvertently cause it. Cover it here, just in case.
1259                     fBreaks.offer(startPos);
1260                 }
1261                 if (endPos > fBreaks.peek()) {
1262                     fBreaks.push(endPos);
1263                 }
1264                 fPositionInCache = 0;
1265                 // Note: Dictionary matching may extend beyond the original limit.
1266                 fStart = fBreaks.elementAt(0);
1267                 fLimit = fBreaks.peek();
1268             } else {
1269                 // there were no language-based breaks, even though the segment contained
1270                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1271                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1272             }
1273 
1274         };
1275 
1276 
DictionaryCache()1277         DictionaryCache() {
1278             fPositionInCache = -1;
1279             fBreaks = new DictionaryBreakEngine.DequeI();
1280         }
1281 
1282         /**
1283          * copy constructor. Used by RuleBasedBreakIterator.clone().
1284          *
1285          * @param src the source object to be copied.
1286          */
DictionaryCache(DictionaryCache src)1287         DictionaryCache(DictionaryCache src)  {
1288             try {
1289                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1290             }
1291             catch (CloneNotSupportedException e) {
1292                 throw new RuntimeException(e);
1293             }
1294             fPositionInCache      = src.fPositionInCache;
1295             fStart                = src.fStart;
1296             fLimit                = src.fLimit;
1297             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1298             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1299             fBoundary             = src.fBoundary;
1300             fStatusIndex          = src.fStatusIndex;
1301         }
1302 
1303         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1304         DictionaryBreakEngine.DequeI fBreaks;
1305         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1306         //                                      //    or preceding(). Optimizes sequential access.
1307         int             fStart;                 // Text position of first boundary in cache.
1308         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1309         //                                      //    text segment being handled by the dictionary.
1310         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1311         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1312         int             fBoundary;              // Current boundary. Set by preceding(), following().
1313         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1314     };
1315 
1316 
1317 
1318 
1319 /*
1320  * class BreakCache
1321  *
1322  * Cache of break boundary positions and rule status values.
1323  * Break iterator API functions, next(), previous(), etc., will use cached results
1324  * when possible, and otherwise cache new results as they are obtained.
1325  *
1326  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1327  *
1328  * The cache is implemented as a single circular buffer.
1329  */
1330 
1331 /*
1332  * size of the circular cache buffer.
1333  */
1334 
1335 class BreakCache {
1336 
BreakCache()1337     BreakCache() {
1338         reset();
1339     };
1340 
reset(int pos, int ruleStatus)1341     void  reset(int pos, int ruleStatus) {
1342         fStartBufIdx = 0;
1343         fEndBufIdx = 0;
1344         fTextIdx = pos;
1345         fBufIdx = 0;
1346         fBoundaries[0] = pos;
1347         fStatuses[0] = (short)ruleStatus;
1348     }
1349 
reset()1350     void  reset() {reset(0, 0); };
1351 
next()1352     void  next() {
1353         if (fBufIdx == fEndBufIdx) {
1354             fDone = !populateFollowing();
1355             fPosition = fTextIdx;
1356             fRuleStatusIndex = fStatuses[fBufIdx];
1357         } else {
1358             fBufIdx = modChunkSize(fBufIdx + 1);
1359             fTextIdx = fPosition = fBoundaries[fBufIdx];
1360             fRuleStatusIndex = fStatuses[fBufIdx];
1361         }
1362     };
1363 
previous()1364     void  previous() {
1365         int initialBufIdx = fBufIdx;
1366         if (fBufIdx == fStartBufIdx) {
1367             // At start of cache. Prepend to it.
1368             populatePreceding();
1369         } else {
1370             // Cache already holds the next boundary
1371             fBufIdx = modChunkSize(fBufIdx - 1);
1372             fTextIdx = fBoundaries[fBufIdx];
1373         }
1374         fDone = (fBufIdx == initialBufIdx);
1375         fPosition = fTextIdx;
1376         fRuleStatusIndex = fStatuses[fBufIdx];
1377         return;
1378     };
1379 
1380     // Move the iteration state to the position following the startPosition.
1381     // Input position must be pinned to the input length.
following(int startPos)1382     void following(int startPos) {
1383         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1384             // startPos is in the cache. Do a next() from that position.
1385             // TODO: an awkward set of interactions with bi->fDone
1386             //       seek() does not clear it; it can't because of interactions with populateNear().
1387             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1388             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1389             fDone = false;
1390             next();
1391         }
1392 
1393     };
1394 
preceding(int startPos)1395     void  preceding(int startPos) {
1396         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1397             if (startPos == fTextIdx) {
1398                 previous();
1399             } else {
1400                 // seek() leaves the BreakCache positioned at the preceding boundary
1401                 //        if the requested position is between two boundaries.
1402                 // current() pushes the BreakCache position out to the BreakIterator itself.
1403                 assert(startPos > fTextIdx);
1404                 current();
1405             }
1406         }
1407         return;
1408     };
1409 
1410     /**
1411      * Update the state of the public BreakIterator (fBI) to reflect the
1412      * current state of the break iterator cache (this).
1413      */
current()1414     int current() {
1415         fPosition = fTextIdx;
1416         fRuleStatusIndex = fStatuses[fBufIdx];
1417         fDone = false;
1418         return fTextIdx;
1419     };
1420 
1421     /**
1422      * Add boundaries to the cache near the specified position.
1423      * The given position need not be a boundary itself.
1424      * The input position must be within the range of the text, and
1425      * on a code point boundary.
1426      * If the requested position is a break boundary, leave the iteration
1427      * position on it.
1428      * If the requested position is not a boundary, leave the iteration
1429      * position on the preceding boundary and include both the the
1430      * preceding and following boundaries in the cache.
1431      * Additional boundaries, either preceding or following, may be added
1432      * to the cache as a side effect.
1433      *
1434      * Return false if the operation failed.
1435      */
populateNear(int position)1436     boolean populateNear(int position) {
1437         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1438 
1439         // Add boundaries to the cache near the specified position.
1440         // The given position need not be a boundary itself.
1441         // The input position must be within the range of the text, and
1442         // on a code point boundary.
1443         // If the requested position is a break boundary, leave the iteration
1444         // position on it.
1445         // If the requested position is not a boundary, leave the iteration
1446         // position on the preceding boundary and include both the
1447         // preceding and following boundaries in the cache.
1448         // Additional boundaries, either preceding or following, may be added
1449         // to the cache as a side effect.
1450 
1451         // If the requested position is not near already cached positions, clear the existing cache,
1452         // find a near-by boundary and begin new cache contents there.
1453 
1454         // Threshold for a text position to be considered near to existing cache contents.
1455         // TODO: See issue ICU-22024 "perf tuning of Cache needed."
1456         //       This value is subject to change. See the ticket for more details.
1457         final int CACHE_NEAR = 15;
1458 
1459         int startOfText = fText.getBeginIndex();
1460         int aBoundary = -1;
1461         int ruleStatusIndex = 0;
1462         boolean retainCache = false;
1463         if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
1464             // Requested position is near the existing cache. Retain it.
1465             retainCache = true;
1466         } else if (position <= startOfText + CACHE_NEAR) {
1467             // Requested position is near the start of the text. Fill cache from start, skipping
1468             // the need to find a safe point.
1469             retainCache = false;
1470             aBoundary = startOfText;
1471         } else {
1472             // Requested position is not near the existing cache.
1473             // Find a safe point to refill the cache from.
1474             int backupPos = handleSafePrevious(position);
1475 
1476             if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
1477                 // The requested position is beyond the end of the existing cache, but the
1478                 // reverse rules produced a position near or before the cached region.
1479                 // Retain the existing cache, and fill from the end of it.
1480                 retainCache = true;
1481             } else if (backupPos < startOfText + CACHE_NEAR) {
1482                 // The safe reverse rules moved us to near the start of text.
1483                 // Take that (index 0) as the backup boundary, avoiding the complication
1484                 // (in the following block) of moving forward from the safe point to a known boundary.
1485                 //
1486                 // Retain the cache if it begins not too far from the requested position.
1487                 aBoundary = startOfText;
1488                 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
1489             } else {
1490                 // The safe reverse rules produced a position that is neither near the existing
1491                 // cache, nor near the start of text.
1492                 // Advance to the boundary following.
1493                 // There is a complication: the safe reverse rules identify pairs of code points
1494                 // that are safe. If advancing from the safe point moves forwards by less than
1495                 // two code points, we need to advance one more time to ensure that the boundary
1496                 // is good, including a correct rules status value.
1497                 //
1498                 retainCache = false;
1499                 fPosition = backupPos;
1500                 aBoundary = handleNext();
1501                 if (aBoundary == backupPos + 1 ||
1502                         (aBoundary == backupPos + 2 &&
1503                         Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1504                         Character.isLowSurrogate(fText.next()))) {
1505                     // The initial handleNext() only advanced by a single code point. Go again.
1506                     // Safe rules identify safe pairs.
1507                     aBoundary = handleNext();
1508                 }
1509                 if (aBoundary == BreakIterator.DONE) {
1510                     aBoundary = fText.getEndIndex();
1511                 }
1512                 ruleStatusIndex = fRuleStatusIndex;
1513             }
1514         }
1515 
1516         if (!retainCache) {
1517             assert(aBoundary != -1);
1518             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1519         }
1520 
1521 
1522         // Fill in boundaries between existing cache content and the new requested position.
1523 
1524         if (fBoundaries[fEndBufIdx] < position) {
1525             // The last position in the cache precedes the requested position.
1526             // Add following position(s) to the cache.
1527             while (fBoundaries[fEndBufIdx] < position) {
1528                 if (!populateFollowing()) {
1529                     assert false;
1530                     return false;
1531                 }
1532             }
1533             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1534             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1535             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1536                 previous();
1537             }
1538             return true;
1539         }
1540 
1541         if (fBoundaries[fStartBufIdx] > position) {
1542             // The first position in the cache is beyond the requested position.
1543             // back up more until we get a boundary <= the requested position.
1544             while (fBoundaries[fStartBufIdx] > position) {
1545                 populatePreceding();
1546             }
1547             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1548             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1549             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1550                 next();
1551             }
1552             if (fTextIdx > position) {
1553                 // If position is not itself a boundary, the next() loop above will overshoot.
1554                 // Back up one, leaving cache position at the boundary preceding the requested position.
1555                 previous();
1556             }
1557             return true;
1558         }
1559 
1560         assert fTextIdx == position;
1561         return true;
1562 
1563     };
1564 
1565     /**
1566      *  Add boundary(s) to the cache following the current last boundary.
1567      *  Return false if at the end of the text, and no more boundaries can be added.
1568      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1569      */
populateFollowing()1570     boolean populateFollowing() {
1571         int fromPosition = fBoundaries[fEndBufIdx];
1572         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1573         int pos = 0;
1574         int ruleStatusIdx = 0;
1575 
1576         if (fDictionaryCache.following(fromPosition)) {
1577             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1578             return true;
1579         }
1580 
1581         fPosition = fromPosition;
1582         pos = handleNext();
1583         if (pos == BreakIterator.DONE) {
1584             return false;
1585         }
1586 
1587         ruleStatusIdx = fRuleStatusIndex;
1588         if (fDictionaryCharCount > 0) {
1589             // The text segment obtained from the rules includes dictionary characters.
1590             // Subdivide it, with subdivided results going into the dictionary cache.
1591             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1592             if (fDictionaryCache.following(fromPosition)) {
1593                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1594                 return true;
1595                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1596                 //       But be careful with interactions with populateNear().
1597             }
1598         }
1599 
1600         // Rule based segment did not include dictionary characters.
1601         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1602         //    meaning that we didn't take the return, above.
1603         // Add its end point to the cache.
1604         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1605 
1606         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1607         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1608         //
1609         for (int count=0; count<6; ++count) {
1610             pos = handleNext();
1611             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1612                 break;
1613             }
1614             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1615         }
1616         return true;
1617     };
1618 
1619     /**
1620      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1621      *  Leave the iteration position on the first added boundary.
1622      *  Return false if no boundaries could be added (if at the start of the text.)
1623      */
populatePreceding()1624     boolean populatePreceding() {
1625         int textBegin = fText.getBeginIndex();
1626         int fromPosition = fBoundaries[fStartBufIdx];
1627         if (fromPosition == textBegin) {
1628             return false;
1629         }
1630 
1631         int position = textBegin;
1632         int positionStatusIdx = 0;
1633 
1634         if (fDictionaryCache.preceding(fromPosition)) {
1635             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1636             return true;
1637         }
1638 
1639         int backupPosition = fromPosition;
1640 
1641         // Find a boundary somewhere preceding the first already-cached boundary
1642         do {
1643             backupPosition = backupPosition - 30;
1644             if (backupPosition <= textBegin) {
1645                 backupPosition = textBegin;
1646             } else {
1647                 backupPosition = handleSafePrevious(backupPosition);
1648             }
1649             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1650                 position = textBegin;
1651                 positionStatusIdx = 0;
1652             } else {
1653                 // Advance to the boundary following the backup position.
1654                 // There is a complication: the safe reverse rules identify pairs of code points
1655                 // that are safe. If advancing from the safe point moves forwards by less than
1656                 // two code points, we need to advance one more time to ensure that the boundary
1657                 // is good, including a correct rules status value.
1658                 //
1659                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1660                 position = handleNext();
1661                 if (position == backupPosition + 1 ||
1662                         (position == backupPosition + 2 &&
1663                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1664                         Character.isLowSurrogate(fText.next()))) {
1665                     // The initial handleNext() only advanced by a single code point. Go again.
1666                     // Safe rules identify safe pairs.
1667                     position = handleNext();
1668                 }
1669                 positionStatusIdx = fRuleStatusIndex;
1670             }
1671         } while (position >= fromPosition);
1672 
1673         // Find boundaries between the one we just located and the first already-cached boundary
1674         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1675 
1676         fSideBuffer.removeAllElements();
1677         fSideBuffer.push(position);
1678         fSideBuffer.push(positionStatusIdx);
1679 
1680         do {
1681             int prevPosition = fPosition = position;
1682             int prevStatusIdx = positionStatusIdx;
1683             position = handleNext();
1684             positionStatusIdx = fRuleStatusIndex;
1685             if (position == BreakIterator.DONE) {
1686                 break;
1687             }
1688 
1689             boolean segmentHandledByDictionary = false;
1690             if (fDictionaryCharCount != 0) {
1691                 // Segment from the rules includes dictionary characters.
1692                 // Subdivide it, with subdivided results going into the dictionary cache.
1693                 int dictSegEndPosition = position;
1694                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1695                 while (fDictionaryCache.following(prevPosition)) {
1696                     position = fDictionaryCache.fBoundary;
1697                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1698                     segmentHandledByDictionary = true;
1699                     assert(position > prevPosition);
1700                     if (position >= fromPosition) {
1701                         break;
1702                     }
1703                     assert(position <= dictSegEndPosition);
1704                     fSideBuffer.push(position);
1705                     fSideBuffer.push(positionStatusIdx);
1706                     prevPosition = position;
1707                 }
1708                 assert(position==dictSegEndPosition || position>=fromPosition);
1709             }
1710 
1711             if (!segmentHandledByDictionary && position < fromPosition) {
1712                 fSideBuffer.push(position);
1713                 fSideBuffer.push(positionStatusIdx);
1714             }
1715         } while (position < fromPosition);
1716 
1717         // Move boundaries from the side buffer to the main circular buffer.
1718         boolean success = false;
1719         if (!fSideBuffer.isEmpty()) {
1720             positionStatusIdx = fSideBuffer.pop();
1721             position = fSideBuffer.pop();
1722             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1723             success = true;
1724         }
1725 
1726         while (!fSideBuffer.isEmpty()) {
1727             positionStatusIdx = fSideBuffer.pop();
1728             position = fSideBuffer.pop();
1729             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1730                 // No space in circular buffer to hold a new preceding result while
1731                 // also retaining the current cache (iteration) position.
1732                 // Bailing out is safe; the cache will refill again if needed.
1733                 break;
1734             }
1735         }
1736         return success;
1737     };
1738 
1739 
1740     static final boolean RetainCachePosition = false;
1741     static final boolean UpdateCachePosition = true;
1742 
1743     /**
1744      * Add the boundary following the current position.
1745      * The current position can be left as it was, or changed to the newly added boundary,
1746      * as specified by the update parameter.
1747      */
addFollowing(int position, int ruleStatusIdx, boolean update)1748     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1749         assert(position > fBoundaries[fEndBufIdx]);
1750         assert(ruleStatusIdx <= Short.MAX_VALUE);
1751         int nextIdx = modChunkSize(fEndBufIdx + 1);
1752         if (nextIdx == fStartBufIdx) {
1753             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1754         }
1755         fBoundaries[nextIdx] = position;
1756         fStatuses[nextIdx] = (short)ruleStatusIdx;
1757         fEndBufIdx = nextIdx;
1758         if (update == UpdateCachePosition) {
1759             // Set current position to the newly added boundary.
1760             fBufIdx = nextIdx;
1761             fTextIdx = position;
1762         } else {
1763             // Retaining the original cache position.
1764             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1765             // It's the responsibility of callers of this function to not add too many.
1766             assert(nextIdx != fBufIdx);
1767         }
1768 
1769     };
1770 
1771 
1772     /**
1773      * Add the boundary preceding the current position.
1774      * The current position can be left as it was, or changed to the newly added boundary,
1775      * as specified by the update parameter.
1776      */
addPreceding(int position, int ruleStatusIdx, boolean update)1777     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1778         assert(position < fBoundaries[fStartBufIdx]);
1779         assert(ruleStatusIdx <= Short.MAX_VALUE);
1780         int nextIdx = modChunkSize(fStartBufIdx - 1);
1781         if (nextIdx == fEndBufIdx) {
1782             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1783                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1784                 // current iteration position. And we also want to retain the current iteration position.
1785                 // (The buffer is already completely full of entries that precede the iteration position.)
1786                 return false;
1787             }
1788             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1789         }
1790         fBoundaries[nextIdx] = position;
1791         fStatuses[nextIdx] = (short)ruleStatusIdx;
1792         fStartBufIdx = nextIdx;
1793         if (update == UpdateCachePosition) {
1794             fBufIdx = nextIdx;
1795             fTextIdx = position;
1796         }
1797         return true;
1798     };
1799 
1800     /**
1801      *  Set the cache position to the specified position, or, if the position
1802      *  falls between to cached boundaries, to the preceding boundary.
1803      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1804      *  The startPosition must be on a code point boundary.
1805      *
1806      *  Return true if successful, false if the specified position is after
1807      *  the last cached boundary or before the first.
1808      */
1809     boolean seek(int pos) {
1810         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1811             return false;
1812         }
1813         if (pos == fBoundaries[fStartBufIdx]) {
1814             // Common case: seek(0), from BreakIterator::first()
1815             fBufIdx = fStartBufIdx;
1816             fTextIdx = fBoundaries[fBufIdx];
1817             return true;
1818         }
1819         if (pos == fBoundaries[fEndBufIdx]) {
1820             fBufIdx = fEndBufIdx;
1821             fTextIdx = fBoundaries[fBufIdx];
1822             return true;
1823         }
1824 
1825         int min = fStartBufIdx;
1826         int max = fEndBufIdx;
1827         while (min != max) {
1828             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1829             probe = modChunkSize(probe);
1830             if (fBoundaries[probe] > pos) {
1831                 max = probe;
1832             } else {
1833                 min = modChunkSize(probe + 1);
1834             }
1835         }
1836         assert(fBoundaries[max] > pos);
1837         fBufIdx = modChunkSize(max - 1);
1838         fTextIdx = fBoundaries[fBufIdx];
1839         assert(fTextIdx <= pos);
1840         return true;
1841 
1842     };
1843 
1844 
1845     /**
1846      * copy constructor, used from RuleBasedBreakIterator.clone().
1847      *
1848      * @param src
1849      */
BreakCache(BreakCache src)1850     BreakCache(BreakCache src)  {
1851         fStartBufIdx = src.fStartBufIdx;
1852         fEndBufIdx = src.fEndBufIdx;
1853         fTextIdx = src.fTextIdx;
1854         fBufIdx = src.fBufIdx;
1855         fBoundaries = src.fBoundaries.clone();
1856         fStatuses = src.fStatuses.clone();
1857         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1858     }
1859 
dumpCache()1860     void dumpCache() {
1861         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1862         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1863             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1864             if (i == fEndBufIdx) {
1865                 break;
1866             }
1867         }
1868     };
1869 
modChunkSize(int index)1870     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1871 
1872     static final int CACHE_SIZE = 128;
1873     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1874 
1875     int                 fStartBufIdx;
1876     int                 fEndBufIdx;    // inclusive
1877 
1878     int                 fTextIdx;
1879     int                 fBufIdx;
1880 
1881     int[]               fBoundaries = new int[CACHE_SIZE];
1882     short[]             fStatuses = new short[CACHE_SIZE];
1883 
1884     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1885 };
1886 
1887 
1888 
1889 
1890 }
1891 
1892