• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 2005-2016 International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package ohos.global.icu.text;
12 
13 import static ohos.global.icu.impl.CharacterIteration.DONE32;
14 import static ohos.global.icu.impl.CharacterIteration.next32;
15 import static ohos.global.icu.impl.CharacterIteration.nextTrail32;
16 
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.OutputStream;
21 import java.nio.ByteBuffer;
22 import java.text.CharacterIterator;
23 import java.util.ArrayList;
24 import java.util.List;
25 
26 import ohos.global.icu.impl.CharacterIteration;
27 import ohos.global.icu.impl.ICUBinary;
28 import ohos.global.icu.impl.ICUDebug;
29 import ohos.global.icu.impl.RBBIDataWrapper;
30 import ohos.global.icu.impl.Trie2;
31 import ohos.global.icu.lang.UCharacter;
32 import ohos.global.icu.lang.UProperty;
33 import ohos.global.icu.lang.UScript;
34 
35 /**
36  * Rule Based Break Iterator
37  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
38  *
39  * @hide exposed on OHOS
40  */
41 public class RuleBasedBreakIterator extends BreakIterator {
42     //=======================================================================
43     // Constructors & Factories
44     //=======================================================================
45 
46     /**
47      * private constructor
48      */
RuleBasedBreakIterator()49     private RuleBasedBreakIterator() {
50         fDictionaryCharCount  = 0;
51         synchronized(gAllBreakEngines) {
52             fBreakEngines = new ArrayList<>(gAllBreakEngines);
53         }
54     }
55 
56     /**
57      * Create a break iterator from a precompiled set of break rules.
58      *
59      * Creating a break iterator from the binary rules is much faster than
60      * creating one from source rules.
61      *
62      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
63      * Binary break iterator rules are not guaranteed to be compatible between
64      * different versions of ICU.
65      *
66      * @param is an input stream supplying the compiled binary rules.
67      * @throws IOException if there is an error while reading the rules from the InputStream.
68      * @see    #compileRules(String, OutputStream)
69      */
getInstanceFromCompiledRules(InputStream is)70     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
71         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
72         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
73         return This;
74     }
75 
76     /**
77      * Create a break iterator from a precompiled set of break rules.
78      *
79      * Creating a break iterator from the binary rules is much faster than
80      * creating one from source rules.
81      *
82      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
83      * Binary break iterator rules are not guaranteed to be compatible between
84      * different versions of ICU.
85      *
86      * @param bytes a buffer supplying the compiled binary rules.
87      * @throws IOException if there is an error while reading the rules from the buffer.
88      * @see    #compileRules(String, OutputStream)
89      * @deprecated This API is ICU internal only.
90      * @hide draft / provisional / internal are hidden on OHOS
91      */
92     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)93     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
94         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
95         This.fRData = RBBIDataWrapper.get(bytes);
96         return This;
97     }
98 
99     /**
100      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
101      * @param rules The break rules to be used.
102      */
RuleBasedBreakIterator(String rules)103     public RuleBasedBreakIterator(String rules)  {
104         this();
105         try {
106             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
107             compileRules(rules, ruleOS);
108             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
109         } catch (IOException e) {
110             ///CLOVER:OFF
111             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
112             //  causing bogus compiled rules to be produced, but with no compile error raised.
113             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
114                     + e.getMessage());
115             throw rte;
116             ///CLOVER:ON
117         }
118     }
119 
120     //=======================================================================
121     // Boilerplate
122     //=======================================================================
123 
124     /**
125      * Clones this iterator.
126      * @return A newly-constructed RuleBasedBreakIterator with the same
127      * behavior as this one.
128      */
129     @Override
clone()130     public Object clone()  {
131         RuleBasedBreakIterator result;
132         result = (RuleBasedBreakIterator)super.clone();
133         if (fText != null) {
134             result.fText = (CharacterIterator)(fText.clone());
135         }
136         synchronized (gAllBreakEngines)  {
137             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
138         }
139         result.fLookAheadMatches = new LookAheadResults();
140         result.fBreakCache = result.new BreakCache(fBreakCache);
141         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
142         return result;
143     }
144 
145 
146     /**
147      * Returns true if both BreakIterators are of the same class, have the same
148      * rules, and iterate over the same text.
149      */
150     @Override
equals(Object that)151     public boolean equals(Object that) {
152         if (that == null) {
153             return false;
154         }
155         if (this == that) {
156             return true;
157         }
158         try {
159             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
160             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
161                 return false;
162             }
163             if (fRData != null && other.fRData != null &&
164                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
165                 return false;
166             }
167             if (fText == null && other.fText == null) {
168                 return true;
169             }
170             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
171                 return false;
172             }
173             return fPosition == other.fPosition;
174         }
175         catch(ClassCastException e) {
176             return false;
177         }
178      }
179 
180     /**
181      * Returns the description (rules) used to create this iterator.
182      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
183      */
184     @Override
toString()185     public String toString() {
186         String retStr = "";
187         if (fRData != null) {
188             retStr =  fRData.fRuleSource;
189         }
190         return retStr;
191     }
192 
193     /**
194      * Compute a hashcode for this BreakIterator
195      * @return A hash code
196      */
197     @Override
hashCode()198     public int hashCode()
199     {
200         return fRData.fRuleSource.hashCode();
201     }
202 
203 
204     private static final int  START_STATE = 1;     // The state number of the starting state
205     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
206 
207     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
208     //               of user text.  A variable with this enum type keeps track of where we
209     //               are.  The state machine only fetches user text input while in RUN mode.
210     private static final int  RBBI_START  = 0;
211     private static final int  RBBI_RUN    = 1;
212     private static final int  RBBI_END    = 2;
213 
214     /**
215      * The character iterator through which this BreakIterator accesses the text.
216      */
217     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
218 
219     /**
220      * The rule data for this BreakIterator instance.
221      * Not intended for public use. Declared public for testing purposes only.
222      * @deprecated This API is ICU internal only.
223      * @hide draft / provisional / internal are hidden on OHOS
224      */
225     @Deprecated
226     public RBBIDataWrapper    fRData;
227 
228     /**
229      *  The iteration state - current position, rule status for the current position,
230      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
231      *                        Current position is pinned to be 0 < position <= text.length.
232      *                        Current position is always set to a boundary.
233      *
234      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
235      *  Never has the value UBRK_DONE (-1).
236      */
237     private int                fPosition;
238 
239     /**
240      * Index of the Rule {tag} values for the most recent match.
241      */
242     private int                fRuleStatusIndex;
243 
244     /**
245      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
246      */
247     private boolean            fDone;
248 
249     /**
250      *   Cache of previously determined boundary positions.
251      */
252     private BreakCache         fBreakCache = new BreakCache();
253 
254 
255     /**
256      * Counter for the number of characters encountered with the "dictionary"
257      *   flag set.  Normal RBBI iterators don't use it, although the code
258      *   for updating it is live.  Dictionary Based break iterators (a subclass
259      *   of us) access this field directly.
260      * @hide draft / provisional / internal are hidden on OHOS
261      */
262     private int fDictionaryCharCount;
263 
264     private DictionaryCache     fDictionaryCache = new DictionaryCache();
265 
266     /**
267      * ICU debug argument name for RBBI
268      */
269     private static final String RBBI_DEBUG_ARG = "rbbi";
270 
271     /**
272      * Debugging flag.  Trace operation of state machine when true.
273      */
274     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
275             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
276 
277     /**
278      * The "default" break engine - just skips over ranges of dictionary words,
279      * producing no breaks. Should only be used if characters need to be handled
280      * by a dictionary but we have no dictionary implementation for them.
281      *
282      * Only one instance; shared by all break iterators.
283      */
284     private static final UnhandledBreakEngine gUnhandledBreakEngine;
285 
286     /**
287      * List of all known break engines, common for all break iterators.
288      * Lazily updated as break engines are needed, because instantiation of
289      * break engines is expensive.
290      *
291      * Because gAllBreakEngines can be referenced concurrently from different
292      * BreakIterator instances, all access is synchronized.
293      */
294     private static final List<LanguageBreakEngine> gAllBreakEngines;
295 
296     static {
297         gUnhandledBreakEngine = new UnhandledBreakEngine();
298         gAllBreakEngines = new ArrayList<>();
299         gAllBreakEngines.add(gUnhandledBreakEngine);
300     }
301 
302     /**
303      * List of all known break engines. Similar to gAllBreakEngines, but local to a
304      * break iterator, allowing it to be used without synchronization.
305      */
306     private List<LanguageBreakEngine> fBreakEngines;
307 
308     /**
309      * Dump the contents of the state table and character classes for this break iterator.
310      * For debugging only.
311      * @deprecated This API is ICU internal only.
312      * @hide draft / provisional / internal are hidden on OHOS
313      */
314     @Deprecated
dump(java.io.PrintStream out)315     public void dump(java.io.PrintStream out) {
316         if (out == null) {
317             out = System.out;
318         }
319         this.fRData.dump(out);
320     }
321 
322     /**
323      * Compile a set of source break rules into the binary state tables used
324      * by the break iterator engine.  Creating a break iterator from precompiled
325      * rules is much faster than creating one from source rules.
326      *
327      * Binary break rules are not guaranteed to be compatible between different
328      * versions of ICU.
329      *
330      *
331      * @param rules  The source form of the break rules
332      * @param ruleBinary  An output stream to receive the compiled rules.
333      * @throws IOException If there is an error writing the output.
334      * @see #getInstanceFromCompiledRules(InputStream)
335      */
compileRules(String rules, OutputStream ruleBinary)336     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
337         RBBIRuleBuilder.compileRules(rules, ruleBinary);
338     }
339 
340     //=======================================================================
341     // BreakIterator overrides
342     //=======================================================================
343 
344     /**
345      * Sets the current iteration position to the beginning of the text.
346      * (i.e., the CharacterIterator's starting offset).
347      * @return The offset of the beginning of the text.
348      */
349     @Override
first()350     public int first() {
351         if (fText == null) {
352             return BreakIterator.DONE;
353         }
354         fText.first();
355         int start =  fText.getIndex();
356         if (!fBreakCache.seek(start)) {
357             fBreakCache.populateNear(start);
358         }
359         fBreakCache.current();
360         assert(fPosition == start);
361         return fPosition;
362     }
363 
364     /**
365      * Sets the current iteration position to the end of the text.
366      * (i.e., the CharacterIterator's ending offset).
367      * @return The text's past-the-end offset.
368      */
369     @Override
last()370     public int last() {
371         if (fText == null) {
372             return BreakIterator.DONE;
373         }
374         int endPos = fText.getEndIndex();
375         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
376         assert(endShouldBeBoundary);
377         if (fPosition != endPos) {
378             assert(fPosition == endPos);
379         }
380         return endPos;
381     }
382 
383     /**
384      * Advances the iterator either forward or backward the specified number of steps.
385      * Negative values move backward, and positive values move forward.  This is
386      * equivalent to repeatedly calling next() or previous().
387      * @param n The number of steps to move.  The sign indicates the direction
388      * (negative is backwards, and positive is forwards).
389      * @return The character offset of the boundary position n boundaries away from
390      * the current one.
391      */
392     @Override
next(int n)393     public int next(int n) {
394         int result = 0;
395         if (n > 0) {
396             for (; n > 0 && result != DONE; --n) {
397                 result = next();
398             }
399         } else if (n < 0) {
400             for (; n < 0 && result != DONE; ++n) {
401                 result = previous();
402             }
403         } else {
404             result = current();
405         }
406         return result;
407     }
408 
409     /**
410      * Advances the iterator to the next boundary position.
411      * @return The position of the first boundary after this one.
412      */
413     @Override
next()414     public int next() {
415         fBreakCache.next();
416         return fDone ? DONE : fPosition;
417     }
418 
419     /**
420      * Moves the iterator backwards, to the boundary preceding the current one.
421      * @return The position of the boundary position immediately preceding the starting position.
422      */
423     @Override
previous()424     public int previous() {
425         fBreakCache.previous();
426         return fDone ? DONE : fPosition;
427     }
428 
429     /**
430      * Sets the iterator to refer to the first boundary position following
431      * the specified position.
432      * @param startPos The position from which to begin searching for a break position.
433      * @return The position of the first break after the current position.
434      */
435     @Override
following(int startPos)436     public int following(int startPos) {
437         // if the supplied position is before the beginning, return the
438         // text's starting offset
439         if (startPos < fText.getBeginIndex()) {
440             return first();
441         }
442 
443         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
444         // Or it may be beyond the end of the text.
445         startPos = CISetIndex32(fText, startPos);
446         fBreakCache.following(startPos);
447         return fDone ? DONE : fPosition;
448     }
449 
450 
451     /**
452      * Sets the iterator to refer to the last boundary position before the
453      * specified position.
454      * @param offset The position to begin searching for a break from.
455      * @return The position of the last boundary before the starting position.
456      */
457     @Override
preceding(int offset)458     public int preceding(int offset) {
459         if (fText == null || offset > fText.getEndIndex()) {
460             return last();
461         } else if (offset < fText.getBeginIndex()) {
462             return first();
463         }
464 
465         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
466         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
467         int adjustedOffset = offset;
468         fBreakCache.preceding(adjustedOffset);
469         return fDone ? DONE : fPosition;
470 
471     }
472 
473 
474     /**
475      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
476      */
checkOffset(int offset, CharacterIterator text)477     protected static final void checkOffset(int offset, CharacterIterator text) {
478         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
479             throw new IllegalArgumentException("offset out of bounds");
480         }
481     }
482 
483 
484     /**
485      * Returns true if the specified position is a boundary position.  As a side
486      * effect, leaves the iterator pointing to the first boundary position at
487      * or after "offset".
488      * @param offset the offset to check.
489      * @return True if "offset" is a boundary position.
490      */
491     @Override
isBoundary(int offset)492     public boolean isBoundary(int offset) {
493         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
494         //       to not be boundaries, and to not be errors.
495         checkOffset(offset, fText);
496 
497         // Adjust offset to be on a code point boundary and not beyond the end of the text.
498         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
499         // But we still need the side effect of leaving iteration at the following boundary.
500         int adjustedOffset = CISetIndex32(fText, offset);
501 
502         boolean result = false;
503         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
504             result = (fBreakCache.current() == offset);
505         }
506 
507         if (!result) {
508             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
509             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
510             next();
511         }
512         return result;
513 
514     }
515 
516     /**
517      * Returns the current iteration position.  Note that DONE is never
518      * returned from this function; if iteration has run to the end of a
519      * string, current() will return the length of the string while
520      * next() will return BreakIterator.DONE).
521      * @return The current iteration position.
522      */
523     @Override
current()524     public int current() {
525         return (fText != null) ? fPosition : BreakIterator.DONE;
526     }
527 
528 
529     /**
530      * Return the status tag from the break rule that determined the boundary at
531      * the current iteration position.  The values appear in the rule source
532      * within brackets, {123}, for example.  For rules that do not specify a
533      * status, a default value of 0 is returned.  If more than one rule applies,
534      * the numerically largest of the possible status values is returned.
535      * <p>
536      * Of the standard types of ICU break iterators, only the word and line break
537      * iterator provides status values.  The values are defined in
538      * class RuleBasedBreakIterator, and allow distinguishing between words
539      * that contain alphabetic letters, "words" that appear to be numbers,
540      * punctuation and spaces, words containing ideographic characters, and
541      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
542      * position from <code>next()</code>, <code>previous()</code>, or
543      * any other break iterator functions that returns a boundary position.
544      * <p>
545      * Note that <code>getRuleStatus()</code> returns the value corresponding to
546      * <code>current()</code> index even after <code>next()</code> has returned DONE.
547      * <p>
548 
549      * @return the status from the break rule that determined the boundary
550      * at the current iteration position.
551      */
552 
553     @Override
getRuleStatus()554     public int  getRuleStatus() {
555         //   Status records have this form:
556         //           Count N         <--  fLastRuleStatusIndex points here.
557         //           Status val 0
558         //           Status val 1
559         //              ...
560         //           Status val N-1  <--  the value we need to return
561         //   The status values are sorted in ascending order.
562         //   This function returns the last (largest) of the array of status values.
563         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
564         int  tagVal = fRData.fStatusTable[idx];
565         return tagVal;
566     }
567 
568     /**
569      * Get the status (tag) values from the break rule(s) that determined the boundary
570      * at the current iteration position.  The values appear in the rule source
571      * within brackets, {123}, for example.  The default status value for rules
572      * that do not explicitly provide one is zero.
573      * <p>
574      * The status values used by the standard ICU break rules are defined
575      * as public constants in class RuleBasedBreakIterator.
576      * <p>
577      * If the size  of the output array is insufficient to hold the data,
578      *  the output will be truncated to the available length.  No exception
579      *  will be thrown.
580      *
581      * @param fillInArray an array to be filled in with the status values.
582      * @return          The number of rule status values from the rules that determined
583      *                  the boundary at the current iteration position.
584      *                  In the event that the array is too small, the return value
585      *                  is the total number of status values that were available,
586      *                  not the reduced number that were actually returned.
587      */
588     @Override
getRuleStatusVec(int[] fillInArray)589     public int getRuleStatusVec(int[] fillInArray) {
590         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
591         if (fillInArray != null) {
592             int numToCopy = Math.min(numStatusVals, fillInArray.length);
593             for (int i=0; i<numToCopy; i++) {
594                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
595             }
596         }
597         return numStatusVals;
598     }
599 
600     /**
601      * Returns a CharacterIterator over the text being analyzed.
602      * <p>
603      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
604      * must not be modified in any way while the BreakIterator is still in use.
605      * Doing so will lead to undefined behavior of the BreakIterator.
606      * Clone the returned CharacterIterator first and work with that.
607      * <p>
608      * The returned CharacterIterator is a reference
609      * to the <b>actual iterator being used</b> by the BreakIterator.
610      * No guarantees are made about the current position
611      * of this iterator when it is returned; it may differ from the
612      * BreakIterators current position.  If you need to move that
613      * position to examine the text, clone this function's return value first.
614      * @return An iterator over the text being analyzed.
615      */
616     @Override
getText()617     public CharacterIterator getText() {
618         return fText;
619     }
620 
621     /**
622      * Set the iterator to analyze a new piece of text.  This function resets
623      * the current iteration position to the beginning of the text.
624      * (The old iterator is dropped.)
625      * <p>
626      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
627      * directly by the BreakIterator, and must not be altered in any
628      * way by code outside of the BreakIterator.
629      * Doing so will lead to undefined behavior of the BreakIterator.
630      *
631      * @param newText An iterator over the text to analyze.
632      */
633     @Override
setText(CharacterIterator newText)634     public void setText(CharacterIterator newText) {
635         if (newText != null) {
636             fBreakCache.reset(newText.getBeginIndex(), 0);
637         } else {
638             fBreakCache.reset();
639         }
640         fDictionaryCache.reset();
641         fText = newText;
642         this.first();
643     }
644 
645      /**
646      * Control debug, trace and dump options.
647      * @deprecated This API is ICU internal only.
648      * @hide draft / provisional / internal are hidden on OHOS
649      */
650     @Deprecated
651     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
652                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
653 
654 
getLanguageBreakEngine(int c)655     private LanguageBreakEngine getLanguageBreakEngine(int c) {
656 
657         // We have a dictionary character.
658         // Does an already instantiated break engine handle it?
659         for (LanguageBreakEngine candidate : fBreakEngines) {
660             if (candidate.handles(c)) {
661                 return candidate;
662             }
663         }
664 
665         synchronized (gAllBreakEngines) {
666             // This break iterator's list of break engines didn't handle the character.
667             // Check the global list, another break iterator may have instantiated the
668             // desired engine.
669             for (LanguageBreakEngine candidate : gAllBreakEngines) {
670                 if (candidate.handles(c)) {
671                     fBreakEngines.add(candidate);
672                     return candidate;
673                 }
674             }
675 
676             // The global list doesn't have an existing engine, build one.
677             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
678             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
679                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
680                 // Fold them together for mapping from script -> engine.
681                 script = UScript.HAN;
682             }
683 
684             LanguageBreakEngine eng;
685             try {
686                 switch (script) {
687                 case UScript.THAI:
688                     eng = new ThaiBreakEngine();
689                     break;
690                 case UScript.LAO:
691                     eng = new LaoBreakEngine();
692                     break;
693                 case UScript.MYANMAR:
694                     eng = new BurmeseBreakEngine();
695                     break;
696                 case UScript.KHMER:
697                     eng = new KhmerBreakEngine();
698                     break;
699                 case UScript.HAN:
700                     eng = new CjkBreakEngine(false);
701                      break;
702                 case UScript.HANGUL:
703                     eng = new CjkBreakEngine(true);
704                     break;
705                 default:
706                     gUnhandledBreakEngine.handleChar(c);
707                     eng = gUnhandledBreakEngine;
708                     break;
709                 }
710             } catch (IOException e) {
711                 eng = null;
712             }
713 
714             if (eng != null && eng != gUnhandledBreakEngine) {
715                 gAllBreakEngines.add(eng);
716                 fBreakEngines.add(eng);
717             }
718             return eng;
719         }   // end synchronized(gAllBreakEngines)
720     }
721 
722     private static final int kMaxLookaheads = 8;
723     private static class LookAheadResults {
724         int      fUsedSlotLimit;
725         int[]    fPositions;
726         int[]    fKeys;
727 
LookAheadResults()728         LookAheadResults() {
729             fUsedSlotLimit= 0;
730             fPositions = new int[kMaxLookaheads];
731             fKeys = new int[kMaxLookaheads];
732         }
733 
getPosition(int key)734         int getPosition(int key) {
735             for (int i=0; i<fUsedSlotLimit; ++i) {
736                 if (fKeys[i] == key) {
737                     return fPositions[i];
738                 }
739             }
740             assert(false);
741             return -1;
742         }
743 
setPosition(int key, int position)744         void setPosition(int key, int position) {
745             int i;
746             for (i=0; i<fUsedSlotLimit; ++i) {
747                 if (fKeys[i] == key) {
748                     fPositions[i] = position;
749                     return;
750                 }
751             }
752             if (i >= kMaxLookaheads) {
753                 assert(false);
754                 i = kMaxLookaheads - 1;
755             }
756             fKeys[i] = key;
757             fPositions[i] = position;
758             assert(fUsedSlotLimit == i);
759             fUsedSlotLimit = i + 1;
760         }
761 
reset()762         void reset() {
763             fUsedSlotLimit = 0;
764         }
765     };
766     private LookAheadResults fLookAheadMatches = new LookAheadResults();
767 
768 
769     /**
770      * The State Machine Engine for moving forward is here.
771      * This function is the heart of the RBBI run time engine.
772      *
773      * Input
774      *    fPosition, the position in the text to begin from.
775      * Output
776      *    fPosition:           the boundary following the starting position.
777      *    fDictionaryCharCount the number of dictionary characters encountered.
778      *                         If > 0, the segment will be further subdivided
779      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
780      *
781      * @return the new iterator position
782      *
783      * A note on supplementary characters and the position of underlying
784      * Java CharacterIterator:   Normally, a character iterator is positioned at
785      * the char most recently returned by next().  Within this function, when
786      * a supplementary char is being processed, the char iterator is left
787      * sitting on the trail surrogate, in the middle of the code point.
788      * This is different from everywhere else, where an iterator always
789      * points at the lead surrogate of a supplementary.
790      */
handleNext()791     private int handleNext() {
792         if (TRACE) {
793             System.out.println("Handle Next   pos      char  state category");
794         }
795 
796         // handleNext always sets the break tag value.
797         // Set the default for it.
798         fRuleStatusIndex  = 0;
799         fDictionaryCharCount = 0;
800 
801         // caches for quicker access
802         CharacterIterator text = fText;
803         Trie2 trie = fRData.fTrie;
804 
805         short[] stateTable  = fRData.fFTable.fTable;
806         int initialPosition = fPosition;
807         text.setIndex(initialPosition);
808         int result          = initialPosition;
809 
810         // Set up the starting char
811         int c = text.current();
812         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
813             c = nextTrail32(text, c);
814             if (c == DONE32) {
815                 fDone = true;
816                 return BreakIterator.DONE;
817             }
818         }
819 
820         // Set the initial state for the state machine
821         int state           = START_STATE;
822         int row             = fRData.getRowIndex(state);
823         short category      = 3;
824         int flagsState      = fRData.fFTable.fFlags;
825         int mode            = RBBI_RUN;
826         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
827             category = 2;
828             mode     = RBBI_START;
829             if (TRACE) {
830                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
831                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
832                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
833             }
834         }
835         fLookAheadMatches.reset();
836 
837         // loop until we reach the end of the text or transition to state 0
838         while (state != STOP_STATE) {
839             if (c == DONE32) {
840                 // Reached end of input string.
841                 if (mode == RBBI_END) {
842                     // We have already run the loop one last time with the
843                     // character set to the pseudo {eof} value. Now it is time
844                     // to unconditionally bail out.
845                     break;
846                 }
847                 // Run the loop one last time with the fake end-of-input character category
848                 mode = RBBI_END;
849                 category = 1;
850             }
851             else if (mode == RBBI_RUN) {
852                 // Get the char category.  An incoming category of 1 or 2 mens that
853                 //      we are preset for doing the beginning or end of input, and
854                 //      that we shouldn't get a category from an actual text input character.
855                 //
856 
857                 // look up the current character's character category, which tells us
858                 // which column in the state table to look at.
859                 //
860                 category = (short) trie.get(c);
861 
862                 // Check the dictionary bit in the character's category.
863                 //    Counter is only used by dictionary based iterators (subclasses).
864                 //    Chars that need to be handled by a dictionary have a flag bit set
865                 //    in their category values.
866                 //
867                 if ((category & 0x4000) != 0)  {
868                     fDictionaryCharCount++;
869                     //  And off the dictionary flag bit.
870                     category &= ~0x4000;
871                 }
872 
873                 if (TRACE) {
874                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
875                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
876                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
877                 }
878 
879                 // Advance to the next character.
880                 // If this is a beginning-of-input loop iteration, don't advance.
881                 //    The next iteration will be processing the first real input character.
882                 c = text.next();
883                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
884                     c = nextTrail32(text, c);
885                 }
886             }
887             else {
888                 mode = RBBI_RUN;
889             }
890 
891             // look up a state transition in the state table
892             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
893             row   = fRData.getRowIndex(state);
894 
895             if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
896                 // Match found, common case
897                 result = text.getIndex();
898                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
899                     // The iterator has been left in the middle of a surrogate pair.
900                     // We want the start of it.
901                     result--;
902                 }
903 
904                 //  Remember the break status (tag) values.
905                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
906             }
907 
908             int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
909             if (completedRule > 0) {
910                 // Lookahead match is completed
911                 int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
912                 if (lookaheadResult >= 0) {
913                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
914                     fPosition = lookaheadResult;
915                     return lookaheadResult;
916                 }
917             }
918 
919             // If we are at the position of the '/' in a look-ahead (hard break) rule;
920             // record the current position, to be returned later, if the full rule matches.
921             // TODO: Move this check before the previous check of fAccepting.
922             //       This would enable hard-break rules with no following context.
923             //       But there are line break test failures when trying this. Investigate.
924             //       Issue ICU-20837
925             int rule =  stateTable[row + RBBIDataWrapper.LOOKAHEAD];
926             if (rule != 0) {
927                 int  pos = text.getIndex();
928                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
929                     // The iterator has been left in the middle of a surrogate pair.
930                     // We want the beginning  of it.
931                     pos--;
932                 }
933                 fLookAheadMatches.setPosition(rule, pos);
934             }
935 
936 
937         }        // End of state machine main loop
938 
939         // The state machine is done.  Check whether it found a match...
940 
941         // If the iterator failed to advance in the match engine force it ahead by one.
942         // This indicates a defect in the break rules, which should always match
943         // at least one character.
944 
945         if (result == initialPosition) {
946             if (TRACE) {
947                 System.out.println("Iterator did not move. Advancing by 1.");
948             }
949             text.setIndex(initialPosition);
950             next32(text);
951             result = text.getIndex();
952             fRuleStatusIndex = 0;
953         }
954 
955         // Leave the iterator at our result position.
956         //   (we may have advanced beyond the last accepting position chasing after
957         //    longer matches that never completed.)
958         fPosition = result;
959 
960         if (TRACE) {
961             System.out.println("result = " + result);
962         }
963         return result;
964     }
965 
966     /**
967      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
968      * This locates a "Safe Position" from which the forward break rules
969      * will operate correctly. A Safe Position is not necessarily a boundary itself.
970      *
971      * The logic of this function is very similar to handleNext(), above, but simpler
972      * because the safe table does not require as many options.
973      *
974      * @param fromPosition the position in the input text to begin the iteration.
975      * @hide draft / provisional / internal are hidden on OHOS
976      */
handleSafePrevious(int fromPosition)977     private int handleSafePrevious(int fromPosition) {
978         int             state;
979         short           category = 0;
980         int             result = 0;
981 
982         // caches for quicker access
983         CharacterIterator text = fText;
984         Trie2 trie = fRData.fTrie;
985         short[] stateTable  = fRData.fRTable.fTable;
986 
987         CISetIndex32(text, fromPosition);
988         if (TRACE) {
989             System.out.print("Handle Previous   pos   char  state category");
990         }
991 
992         // if we're already at the start of the text, return DONE.
993         if (text.getIndex() == text.getBeginIndex()) {
994             return BreakIterator.DONE;
995         }
996 
997         //  Set the initial state for the state machine
998         int c = CharacterIteration.previous32(text);
999         state = START_STATE;
1000         int row = fRData.getRowIndex(state);
1001 
1002         // loop until we reach the start of the text or transition to state 0
1003         //
1004         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
1005 
1006             // look up the current character's character category, which tells us
1007             // which column in the state table to look at.
1008             //
1009             //  And off the dictionary flag bit. For reverse iteration it is not used.
1010             category = (short) trie.get(c);
1011             category &= ~0x4000;
1012             if (TRACE) {
1013                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1014                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1015                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1016             }
1017 
1018             // State Transition - move machine to its next state
1019             //
1020             assert(category < fRData.fHeader.fCatCount);
1021             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1022             row   = fRData.getRowIndex(state);
1023 
1024             if (state == STOP_STATE) {
1025                 // This is the normal exit from the lookup state machine.
1026                 // Transition to state zero means we have found a safe point.
1027                 break;
1028             }
1029         }
1030 
1031         // The state machine is done.
1032         result = text.getIndex();
1033         if (TRACE) {
1034             System.out.println("result = " + result);
1035         }
1036         return result;
1037     }
1038 
1039     /**
1040      * Set the index of a CharacterIterator.
1041      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1042      * If the index points to a trail surrogate of a supplementary character, adjust it
1043      * to the start (lead surrogate) index.
1044      *
1045      * @param ci A CharacterIterator to set
1046      * @param index the index to set
1047      * @return the resulting index, possibly pinned or adjusted.
1048      */
1049     private static int CISetIndex32(CharacterIterator ci, int index) {
1050         if (index <= ci.getBeginIndex()) {
1051             ci.first();
1052         } else if (index >= ci.getEndIndex()) {
1053             ci.setIndex(ci.getEndIndex());
1054         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1055             if (!Character.isHighSurrogate(ci.previous())) {
1056                 ci.next();
1057             }
1058         }
1059         return ci.getIndex();
1060     }
1061 
1062     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1063      *                 Dictionary boundaries are moved first to this cache, then from here
1064      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1065      *                 boundaries. The public BreakIterator API always fetches directly
1066      *                 from the main BreakCache, not from here.
1067      *
1068      *                 In common situations, the number of boundaries in a single dictionary run
1069      *                 should be quite small, it will be terminated by punctuation, spaces,
1070      *                 or any other non-dictionary characters. The main BreakCache may end
1071      *                 up with boundaries from multiple dictionary based runs.
1072      *
1073      *                 The boundaries are stored in a simple ArrayList (vector), with the
1074      *                 assumption that they will be accessed sequentially.
1075      */
1076     class DictionaryCache  {
1077 
1078          void reset() {
1079              fPositionInCache = -1;
1080              fStart = 0;
1081              fLimit = 0;
1082              fFirstRuleStatusIndex = 0;
1083              fOtherRuleStatusIndex = 0;
1084              fBreaks.removeAllElements();
1085          };
1086 
1087          boolean following(int fromPos) {
1088              if (fromPos >= fLimit || fromPos < fStart) {
1089                  fPositionInCache = -1;
1090                  return false;
1091              }
1092 
1093              // Sequential iteration, move from previous boundary to the following
1094 
1095              int r = 0;
1096              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1097                  ++fPositionInCache;
1098                  if (fPositionInCache >= fBreaks.size()) {
1099                      fPositionInCache = -1;
1100                      return false;
1101                  }
1102                  r = fBreaks.elementAt(fPositionInCache);
1103                  assert(r > fromPos);
1104                  fBoundary = r;
1105                  fStatusIndex = fOtherRuleStatusIndex;
1106                  return true;
1107              }
1108 
1109              // Random indexing. Linear search for the boundary following the given position.
1110 
1111              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1112                  r= fBreaks.elementAt(fPositionInCache);
1113                  if (r > fromPos) {
1114                      fBoundary = r;
1115                      fStatusIndex = fOtherRuleStatusIndex;
1116                      return true;
1117                  }
1118              }
1119 
1120              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1121              assert(false);
1122              fPositionInCache = -1;
1123              return false;
1124          };
1125 
preceding(int fromPos)1126          boolean preceding(int fromPos) {
1127              if (fromPos <= fStart || fromPos > fLimit) {
1128                  fPositionInCache = -1;
1129                  return false;
1130              }
1131 
1132              if (fromPos == fLimit) {
1133                  fPositionInCache = fBreaks.size() - 1;
1134                  if (fPositionInCache >= 0) {
1135                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1136                  }
1137              }
1138 
1139              int r;
1140              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1141                  --fPositionInCache;
1142                  r = fBreaks.elementAt(fPositionInCache);
1143                  assert(r < fromPos);
1144                  fBoundary = r;
1145                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1146                  return true;
1147              }
1148 
1149              if (fPositionInCache == 0) {
1150                  fPositionInCache = -1;
1151                  return false;
1152              }
1153 
1154              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1155                  r = fBreaks.elementAt(fPositionInCache);
1156                  if (r < fromPos) {
1157                      fBoundary = r;
1158                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1159                      return true;
1160                  }
1161              }
1162              assert(false);
1163              fPositionInCache = -1;
1164              return false;
1165          };
1166 
1167         /**
1168          * Populate the cache with the dictionary based boundaries within a region of text.
1169          * @param startPos  The start position of a range of text
1170          * @param endPos    The end position of a range of text
1171          * @param firstRuleStatus The rule status index that applies to the break at startPos
1172          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1173          * @hide draft / provisional / internal are hidden on OHOS
1174          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1175         void populateDictionary(int startPos, int endPos,
1176                                 int firstRuleStatus, int otherRuleStatus) {
1177             if ((endPos - startPos) <= 1) {
1178                 return;
1179             }
1180 
1181             reset();
1182             fFirstRuleStatusIndex = firstRuleStatus;
1183             fOtherRuleStatusIndex = otherRuleStatus;
1184 
1185             int rangeStart = startPos;
1186             int rangeEnd = endPos;
1187 
1188             int         category;
1189             int         current;
1190             int         foundBreakCount = 0;
1191 
1192             // Loop through the text, looking for ranges of dictionary characters.
1193             // For each span, find the appropriate break engine, and ask it to find
1194             // any breaks within the span.
1195 
1196             fText.setIndex(rangeStart);
1197             int     c = CharacterIteration.current32(fText);
1198             category = (short)fRData.fTrie.get(c);
1199 
1200             while(true) {
1201                 while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
1202                     c = CharacterIteration.next32(fText);    // pre-increment
1203                     category = (short)fRData.fTrie.get(c);
1204                 }
1205                 if (current >= rangeEnd) {
1206                     break;
1207                 }
1208 
1209                 // We now have a dictionary character. Get the appropriate language object
1210                 // to deal with it.
1211                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1212 
1213                 // Ask the language object if there are any breaks. It will add them to the cache and
1214                 // leave the text pointer on the other side of its range, ready to search for the next one.
1215                 if (lbe != null) {
1216                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
1217                 }
1218 
1219                 // Reload the loop variables for the next go-round
1220                 c = CharacterIteration.current32(fText);
1221                 category = (short)fRData.fTrie.get(c);
1222             }
1223 
1224             // If we found breaks, ensure that the first and last entries are
1225             // the original starting and ending position. And initialize the
1226             // cache iteration position to the first entry.
1227 
1228             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1229             if (foundBreakCount > 0) {
1230                 assert(foundBreakCount == fBreaks.size());
1231                 if (startPos < fBreaks.elementAt(0)) {
1232                     // The dictionary did not place a boundary at the start of the segment of text.
1233                     // Add one now. This should not commonly happen, but it would be easy for interactions
1234                     // of the rules for dictionary segments and the break engine implementations to
1235                     // inadvertently cause it. Cover it here, just in case.
1236                     fBreaks.offer(startPos);
1237                 }
1238                 if (endPos > fBreaks.peek()) {
1239                     fBreaks.push(endPos);
1240                 }
1241                 fPositionInCache = 0;
1242                 // Note: Dictionary matching may extend beyond the original limit.
1243                 fStart = fBreaks.elementAt(0);
1244                 fLimit = fBreaks.peek();
1245             } else {
1246                 // there were no language-based breaks, even though the segment contained
1247                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1248                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1249             }
1250 
1251         };
1252 
1253 
DictionaryCache()1254         DictionaryCache() {
1255             fPositionInCache = -1;
1256             fBreaks = new DictionaryBreakEngine.DequeI();
1257         }
1258 
1259         /**
1260          * copy constructor. Used by RuleBasedBreakIterator.clone().
1261          *
1262          * @param src the source object to be copied.
1263          */
DictionaryCache(DictionaryCache src)1264         DictionaryCache(DictionaryCache src)  {
1265             try {
1266                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1267             }
1268             catch (CloneNotSupportedException e) {
1269                 throw new RuntimeException(e);
1270             }
1271             fPositionInCache      = src.fPositionInCache;
1272             fStart                = src.fStart;
1273             fLimit                = src.fLimit;
1274             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1275             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1276             fBoundary             = src.fBoundary;
1277             fStatusIndex          = src.fStatusIndex;
1278         }
1279 
1280         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1281         DictionaryBreakEngine.DequeI fBreaks;
1282         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1283         //                                      //    or preceding(). Optimizes sequential access.
1284         int             fStart;                 // Text position of first boundary in cache.
1285         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1286         //                                      //    text segment being handled by the dictionary.
1287         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1288         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1289         int             fBoundary;              // Current boundary. Set by preceding(), following().
1290         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1291     };
1292 
1293 
1294 
1295 
1296 /*
1297  * class BreakCache
1298  *
1299  * Cache of break boundary positions and rule status values.
1300  * Break iterator API functions, next(), previous(), etc., will use cached results
1301  * when possible, and otherwise cache new results as they are obtained.
1302  *
1303  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1304  *
1305  * The cache is implemented as a single circular buffer.
1306  */
1307 
1308 /*
1309  * size of the circular cache buffer.
1310  */
1311 
1312 class BreakCache {
1313 
BreakCache()1314     BreakCache() {
1315         reset();
1316     };
1317 
reset(int pos, int ruleStatus)1318     void  reset(int pos, int ruleStatus) {
1319         fStartBufIdx = 0;
1320         fEndBufIdx = 0;
1321         fTextIdx = pos;
1322         fBufIdx = 0;
1323         fBoundaries[0] = pos;
1324         fStatuses[0] = (short)ruleStatus;
1325     }
1326 
reset()1327     void  reset() {reset(0, 0); };
1328 
next()1329     void  next() {
1330         if (fBufIdx == fEndBufIdx) {
1331             fDone = !populateFollowing();
1332             fPosition = fTextIdx;
1333             fRuleStatusIndex = fStatuses[fBufIdx];
1334         } else {
1335             fBufIdx = modChunkSize(fBufIdx + 1);
1336             fTextIdx = fPosition = fBoundaries[fBufIdx];
1337             fRuleStatusIndex = fStatuses[fBufIdx];
1338         }
1339     };
1340 
previous()1341     void  previous() {
1342         int initialBufIdx = fBufIdx;
1343         if (fBufIdx == fStartBufIdx) {
1344             // At start of cache. Prepend to it.
1345             populatePreceding();
1346         } else {
1347             // Cache already holds the next boundary
1348             fBufIdx = modChunkSize(fBufIdx - 1);
1349             fTextIdx = fBoundaries[fBufIdx];
1350         }
1351         fDone = (fBufIdx == initialBufIdx);
1352         fPosition = fTextIdx;
1353         fRuleStatusIndex = fStatuses[fBufIdx];
1354         return;
1355     };
1356 
1357     // Move the iteration state to the position following the startPosition.
1358     // Input position must be pinned to the input length.
following(int startPos)1359     void following(int startPos) {
1360         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1361             // startPos is in the cache. Do a next() from that position.
1362             // TODO: an awkward set of interactions with bi->fDone
1363             //       seek() does not clear it; it can't because of interactions with populateNear().
1364             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1365             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1366             fDone = false;
1367             next();
1368         }
1369 
1370     };
1371 
preceding(int startPos)1372     void  preceding(int startPos) {
1373         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1374             if (startPos == fTextIdx) {
1375                 previous();
1376             } else {
1377                 // seek() leaves the BreakCache positioned at the preceding boundary
1378                 //        if the requested position is between two bounaries.
1379                 // current() pushes the BreakCache position out to the BreakIterator itself.
1380                 assert(startPos > fTextIdx);
1381                 current();
1382             }
1383         }
1384         return;
1385     };
1386 
1387     /**
1388      * Update the state of the public BreakIterator (fBI) to reflect the
1389      * current state of the break iterator cache (this).
1390      */
current()1391     int current() {
1392         fPosition = fTextIdx;
1393         fRuleStatusIndex = fStatuses[fBufIdx];
1394         fDone = false;
1395         return fTextIdx;
1396     };
1397 
1398     /**
1399      * Add boundaries to the cache near the specified position.
1400      * The given position need not be a boundary itself.
1401      * The input position must be within the range of the text, and
1402      * on a code point boundary.
1403      * If the requested position is a break boundary, leave the iteration
1404      * position on it.
1405      * If the requested position is not a boundary, leave the iteration
1406      * position on the preceding boundary and include both the the
1407      * preceding and following boundaries in the cache.
1408      * Additional boundaries, either preceding or following, may be added
1409      * to the cache as a side effect.
1410      *
1411      * Return false if the operation failed.
1412      */
populateNear(int position)1413     boolean populateNear(int position) {
1414         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1415 
1416         // Find a boundary somewhere in the vicinity of the requested position.
1417         // Depending on the safe rules and the text data, it could be either before, at, or after
1418         // the requested position.
1419 
1420 
1421         // If the requested position is not near already cached positions, clear the existing cache,
1422         // find a near-by boundary and begin new cache contents there.
1423 
1424         if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1425             int aBoundary = fText.getBeginIndex();
1426             int ruleStatusIndex = 0;
1427             if (position > aBoundary + 20) {
1428                 int backupPos = handleSafePrevious(position);
1429                 if (backupPos > aBoundary) {
1430                     // Advance to the boundary following the backup position.
1431                     // There is a complication: the safe reverse rules identify pairs of code points
1432                     // that are safe. If advancing from the safe point moves forwards by less than
1433                     // two code points, we need to advance one more time to ensure that the boundary
1434                     // is good, including a correct rules status value.
1435                     //
1436                     fPosition = backupPos;
1437                     aBoundary = handleNext();
1438                     if (aBoundary == backupPos + 1 ||
1439                             (aBoundary == backupPos + 2 &&
1440                             Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1441                             Character.isLowSurrogate(fText.next()))) {
1442                         // The initial handleNext() only advanced by a single code point. Go again.
1443                         // Safe rules identify safe pairs.
1444                         aBoundary = handleNext();
1445                     }
1446                 }
1447                 ruleStatusIndex = fRuleStatusIndex;
1448             }
1449             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1450         }
1451 
1452         // Fill in boundaries between existing cache content and the new requested position.
1453 
1454         if (fBoundaries[fEndBufIdx] < position) {
1455             // The last position in the cache precedes the requested position.
1456             // Add following position(s) to the cache.
1457             while (fBoundaries[fEndBufIdx] < position) {
1458                 if (!populateFollowing()) {
1459                     assert false;
1460                     return false;
1461                 }
1462             }
1463             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1464             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1465             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1466                 previous();
1467             }
1468             return true;
1469         }
1470 
1471         if (fBoundaries[fStartBufIdx] > position) {
1472             // The first position in the cache is beyond the requested position.
1473             // back up more until we get a boundary <= the requested position.
1474             while (fBoundaries[fStartBufIdx] > position) {
1475                 populatePreceding();
1476             }
1477             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1478             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1479             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1480                 next();
1481             }
1482             if (fTextIdx > position) {
1483                 // If position is not itself a boundary, the next() loop above will overshoot.
1484                 // Back up one, leaving cache position at the boundary preceding the requested position.
1485                 previous();
1486             }
1487             return true;
1488         }
1489 
1490         assert fTextIdx == position;
1491         return true;
1492 
1493     };
1494 
1495     /**
1496      *  Add boundary(s) to the cache following the current last boundary.
1497      *  Return false if at the end of the text, and no more boundaries can be added.
1498      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1499      */
populateFollowing()1500     boolean populateFollowing() {
1501         int fromPosition = fBoundaries[fEndBufIdx];
1502         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1503         int pos = 0;
1504         int ruleStatusIdx = 0;
1505 
1506         if (fDictionaryCache.following(fromPosition)) {
1507             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1508             return true;
1509         }
1510 
1511         fPosition = fromPosition;
1512         pos = handleNext();
1513         if (pos == BreakIterator.DONE) {
1514             return false;
1515         }
1516 
1517         ruleStatusIdx = fRuleStatusIndex;
1518         if (fDictionaryCharCount > 0) {
1519             // The text segment obtained from the rules includes dictionary characters.
1520             // Subdivide it, with subdivided results going into the dictionary cache.
1521             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1522             if (fDictionaryCache.following(fromPosition)) {
1523                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1524                 return true;
1525                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1526                 //       But be careful with interactions with populateNear().
1527             }
1528         }
1529 
1530         // Rule based segment did not include dictionary characters.
1531         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1532         //    meaning that we didn't take the return, above.
1533         // Add its end point to the cache.
1534         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1535 
1536         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1537         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1538         //
1539         for (int count=0; count<6; ++count) {
1540             pos = handleNext();
1541             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1542                 break;
1543             }
1544             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1545         }
1546         return true;
1547     };
1548 
1549     /**
1550      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1551      *  Leave the iteration position on the first added boundary.
1552      *  Return false if no boundaries could be added (if at the start of the text.)
1553      */
populatePreceding()1554     boolean populatePreceding() {
1555         int textBegin = fText.getBeginIndex();
1556         int fromPosition = fBoundaries[fStartBufIdx];
1557         if (fromPosition == textBegin) {
1558             return false;
1559         }
1560 
1561         int position = textBegin;
1562         int positionStatusIdx = 0;
1563 
1564         if (fDictionaryCache.preceding(fromPosition)) {
1565             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1566             return true;
1567         }
1568 
1569         int backupPosition = fromPosition;
1570 
1571         // Find a boundary somewhere preceding the first already-cached boundary
1572         do {
1573             backupPosition = backupPosition - 30;
1574             if (backupPosition <= textBegin) {
1575                 backupPosition = textBegin;
1576             } else {
1577                 backupPosition = handleSafePrevious(backupPosition);
1578             }
1579             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1580                 position = textBegin;
1581                 positionStatusIdx = 0;
1582             } else {
1583                 // Advance to the boundary following the backup position.
1584                 // There is a complication: the safe reverse rules identify pairs of code points
1585                 // that are safe. If advancing from the safe point moves forwards by less than
1586                 // two code points, we need to advance one more time to ensure that the boundary
1587                 // is good, including a correct rules status value.
1588                 //
1589                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1590                 position = handleNext();
1591                 if (position == backupPosition + 1 ||
1592                         (position == backupPosition + 2 &&
1593                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1594                         Character.isLowSurrogate(fText.next()))) {
1595                     // The initial handleNext() only advanced by a single code point. Go again.
1596                     // Safe rules identify safe pairs.
1597                     position = handleNext();
1598                 }
1599                 positionStatusIdx = fRuleStatusIndex;
1600             }
1601         } while (position >= fromPosition);
1602 
1603         // Find boundaries between the one we just located and the first already-cached boundary
1604         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1605 
1606         fSideBuffer.removeAllElements();
1607         fSideBuffer.push(position);
1608         fSideBuffer.push(positionStatusIdx);
1609 
1610         do {
1611             int prevPosition = fPosition = position;
1612             int prevStatusIdx = positionStatusIdx;
1613             position = handleNext();
1614             positionStatusIdx = fRuleStatusIndex;
1615             if (position == BreakIterator.DONE) {
1616                 break;
1617             }
1618 
1619             boolean segmentHandledByDictionary = false;
1620             if (fDictionaryCharCount != 0) {
1621                 // Segment from the rules includes dictionary characters.
1622                 // Subdivide it, with subdivided results going into the dictionary cache.
1623                 int dictSegEndPosition = position;
1624                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1625                 while (fDictionaryCache.following(prevPosition)) {
1626                     position = fDictionaryCache.fBoundary;
1627                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1628                     segmentHandledByDictionary = true;
1629                     assert(position > prevPosition);
1630                     if (position >= fromPosition) {
1631                         break;
1632                     }
1633                     assert(position <= dictSegEndPosition);
1634                     fSideBuffer.push(position);
1635                     fSideBuffer.push(positionStatusIdx);
1636                     prevPosition = position;
1637                 }
1638                 assert(position==dictSegEndPosition || position>=fromPosition);
1639             }
1640 
1641             if (!segmentHandledByDictionary && position < fromPosition) {
1642                 fSideBuffer.push(position);
1643                 fSideBuffer.push(positionStatusIdx);
1644             }
1645         } while (position < fromPosition);
1646 
1647         // Move boundaries from the side buffer to the main circular buffer.
1648         boolean success = false;
1649         if (!fSideBuffer.isEmpty()) {
1650             positionStatusIdx = fSideBuffer.pop();
1651             position = fSideBuffer.pop();
1652             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1653             success = true;
1654         }
1655 
1656         while (!fSideBuffer.isEmpty()) {
1657             positionStatusIdx = fSideBuffer.pop();
1658             position = fSideBuffer.pop();
1659             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1660                 // No space in circular buffer to hold a new preceding result while
1661                 // also retaining the current cache (iteration) position.
1662                 // Bailing out is safe; the cache will refill again if needed.
1663                 break;
1664             }
1665         }
1666         return success;
1667     };
1668 
1669 
1670     static final boolean RetainCachePosition = false;
1671     static final boolean UpdateCachePosition = true;
1672 
1673     /**
1674      * Add the boundary following the current position.
1675      * The current position can be left as it was, or changed to the newly added boundary,
1676      * as specified by the update parameter.
1677      */
addFollowing(int position, int ruleStatusIdx, boolean update)1678     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1679         assert(position > fBoundaries[fEndBufIdx]);
1680         assert(ruleStatusIdx <= Short.MAX_VALUE);
1681         int nextIdx = modChunkSize(fEndBufIdx + 1);
1682         if (nextIdx == fStartBufIdx) {
1683             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1684         }
1685         fBoundaries[nextIdx] = position;
1686         fStatuses[nextIdx] = (short)ruleStatusIdx;
1687         fEndBufIdx = nextIdx;
1688         if (update == UpdateCachePosition) {
1689             // Set current position to the newly added boundary.
1690             fBufIdx = nextIdx;
1691             fTextIdx = position;
1692         } else {
1693             // Retaining the original cache position.
1694             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1695             // It's the responsibility of callers of this function to not add too many.
1696             assert(nextIdx != fBufIdx);
1697         }
1698 
1699     };
1700 
1701 
1702     /**
1703      * Add the boundary preceding the current position.
1704      * The current position can be left as it was, or changed to the newly added boundary,
1705      * as specified by the update parameter.
1706      */
addPreceding(int position, int ruleStatusIdx, boolean update)1707     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1708         assert(position < fBoundaries[fStartBufIdx]);
1709         assert(ruleStatusIdx <= Short.MAX_VALUE);
1710         int nextIdx = modChunkSize(fStartBufIdx - 1);
1711         if (nextIdx == fEndBufIdx) {
1712             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1713                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1714                 // current iteration position. And we also want to retain the current iteration position.
1715                 // (The buffer is already completely full of entries that precede the iteration position.)
1716                 return false;
1717             }
1718             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1719         }
1720         fBoundaries[nextIdx] = position;
1721         fStatuses[nextIdx] = (short)ruleStatusIdx;
1722         fStartBufIdx = nextIdx;
1723         if (update == UpdateCachePosition) {
1724             fBufIdx = nextIdx;
1725             fTextIdx = position;
1726         }
1727         return true;
1728     };
1729 
1730     /**
1731      *  Set the cache position to the specified position, or, if the position
1732      *  falls between to cached boundaries, to the preceding boundary.
1733      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1734      *  The startPosition must be on a code point boundary.
1735      *
1736      *  Return true if successful, false if the specified position is after
1737      *  the last cached boundary or before the first.
1738      */
1739     boolean seek(int pos) {
1740         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1741             return false;
1742         }
1743         if (pos == fBoundaries[fStartBufIdx]) {
1744             // Common case: seek(0), from BreakIterator::first()
1745             fBufIdx = fStartBufIdx;
1746             fTextIdx = fBoundaries[fBufIdx];
1747             return true;
1748         }
1749         if (pos == fBoundaries[fEndBufIdx]) {
1750             fBufIdx = fEndBufIdx;
1751             fTextIdx = fBoundaries[fBufIdx];
1752             return true;
1753         }
1754 
1755         int min = fStartBufIdx;
1756         int max = fEndBufIdx;
1757         while (min != max) {
1758             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1759             probe = modChunkSize(probe);
1760             if (fBoundaries[probe] > pos) {
1761                 max = probe;
1762             } else {
1763                 min = modChunkSize(probe + 1);
1764             }
1765         }
1766         assert(fBoundaries[max] > pos);
1767         fBufIdx = modChunkSize(max - 1);
1768         fTextIdx = fBoundaries[fBufIdx];
1769         assert(fTextIdx <= pos);
1770         return true;
1771 
1772     };
1773 
1774 
1775     /**
1776      * copy constructor, used from RuleBasedBreakIterator.clone().
1777      *
1778      * @param src
1779      */
BreakCache(BreakCache src)1780     BreakCache(BreakCache src)  {
1781         fStartBufIdx = src.fStartBufIdx;
1782         fEndBufIdx = src.fEndBufIdx;
1783         fTextIdx = src.fTextIdx;
1784         fBufIdx = src.fBufIdx;
1785         fBoundaries = src.fBoundaries.clone();
1786         fStatuses = src.fStatuses.clone();
1787         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1788     }
1789 
dumpCache()1790     void dumpCache() {
1791         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1792         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1793             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1794             if (i == fEndBufIdx) {
1795                 break;
1796             }
1797         }
1798     };
1799 
modChunkSize(int index)1800     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1801 
1802     static final int CACHE_SIZE = 128;
1803     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1804 
1805     int                 fStartBufIdx;
1806     int                 fEndBufIdx;    // inclusive
1807 
1808     int                 fTextIdx;
1809     int                 fBufIdx;
1810 
1811     int[]               fBoundaries = new int[CACHE_SIZE];
1812     short[]             fStatuses = new short[CACHE_SIZE];
1813 
1814     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1815 };
1816 
1817 
1818 
1819 
1820 }
1821 
1822