• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 2005-2016 International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package android.icu.text;
12 
13 import static android.icu.impl.CharacterIteration.DONE32;
14 import static android.icu.impl.CharacterIteration.next32;
15 import static android.icu.impl.CharacterIteration.nextTrail32;
16 
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.io.InputStream;
20 import java.io.OutputStream;
21 import java.nio.ByteBuffer;
22 import java.text.CharacterIterator;
23 import java.util.ArrayList;
24 import java.util.List;
25 
26 import android.icu.impl.CharacterIteration;
27 import android.icu.impl.ICUBinary;
28 import android.icu.impl.ICUDebug;
29 import android.icu.impl.RBBIDataWrapper;
30 import android.icu.lang.UCharacter;
31 import android.icu.lang.UProperty;
32 import android.icu.lang.UScript;
33 import android.icu.util.CodePointTrie;
34 
35 /**
36  * Rule Based Break Iterator
37  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
38  *
39  * @hide Only a subset of ICU is exposed in Android
40  */
41 public class RuleBasedBreakIterator extends BreakIterator {
42     //=======================================================================
43     // Constructors & Factories
44     //=======================================================================
45 
46     /**
47      * private constructor
48      */
RuleBasedBreakIterator()49     private RuleBasedBreakIterator() {
50         fDictionaryCharCount  = 0;
51         synchronized(gAllBreakEngines) {
52             fBreakEngines = new ArrayList<>(gAllBreakEngines);
53         }
54     }
55 
56     /**
57      * Create a break iterator from a precompiled set of break rules.
58      *
59      * Creating a break iterator from the binary rules is much faster than
60      * creating one from source rules.
61      *
62      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
63      * Binary break iterator rules are not guaranteed to be compatible between
64      * different versions of ICU.
65      *
66      * @param is an input stream supplying the compiled binary rules.
67      * @throws IOException if there is an error while reading the rules from the InputStream.
68      * @see    #compileRules(String, OutputStream)
69      */
getInstanceFromCompiledRules(InputStream is)70     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
71         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
72         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
73         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
74         return This;
75     }
76 
77     /**
78      * Create a break iterator from a precompiled set of break rules.
79      *
80      * Creating a break iterator from the binary rules is much faster than
81      * creating one from source rules.
82      *
83      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
84      * Binary break iterator rules are not guaranteed to be compatible between
85      * different versions of ICU.
86      *
87      * @param bytes a buffer supplying the compiled binary rules.
88      * @throws IOException if there is an error while reading the rules from the buffer.
89      * @see    #compileRules(String, OutputStream)
90      * @deprecated This API is ICU internal only.
91      * @hide draft / provisional / internal are hidden on Android
92      */
93     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)94     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
95         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
96         This.fRData = RBBIDataWrapper.get(bytes);
97         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
98         return This;
99     }
100 
101     /**
102      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
103      * @param rules The break rules to be used.
104      */
RuleBasedBreakIterator(String rules)105     public RuleBasedBreakIterator(String rules)  {
106         this();
107         try {
108             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
109             compileRules(rules, ruleOS);
110             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
111             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
112         } catch (IOException e) {
113             ///CLOVER:OFF
114             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
115             //  causing bogus compiled rules to be produced, but with no compile error raised.
116             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
117                     + e.getMessage());
118             throw rte;
119             ///CLOVER:ON
120         }
121     }
122 
123     //=======================================================================
124     // Boilerplate
125     //=======================================================================
126 
127     /**
128      * Clones this iterator.
129      * @return A newly-constructed RuleBasedBreakIterator with the same
130      * behavior as this one.
131      */
132     @Override
clone()133     public Object clone()  {
134         RuleBasedBreakIterator result;
135         result = (RuleBasedBreakIterator)super.clone();
136         if (fText != null) {
137             result.fText = (CharacterIterator)(fText.clone());
138         }
139         synchronized (gAllBreakEngines)  {
140             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
141         }
142         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
143         result.fBreakCache = result.new BreakCache(fBreakCache);
144         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
145         return result;
146     }
147 
148 
149     /**
150      * Returns true if both BreakIterators are of the same class, have the same
151      * rules, and iterate over the same text.
152      */
153     @Override
equals(Object that)154     public boolean equals(Object that) {
155         if (that == null) {
156             return false;
157         }
158         if (this == that) {
159             return true;
160         }
161         try {
162             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
163             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
164                 return false;
165             }
166             if (fRData != null && other.fRData != null &&
167                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
168                 return false;
169             }
170             if (fText == null && other.fText == null) {
171                 return true;
172             }
173             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
174                 return false;
175             }
176             return fPosition == other.fPosition;
177         }
178         catch(ClassCastException e) {
179             return false;
180         }
181      }
182 
183     /**
184      * Returns the description (rules) used to create this iterator.
185      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
186      */
187     @Override
toString()188     public String toString() {
189         String retStr = "";
190         if (fRData != null) {
191             retStr =  fRData.fRuleSource;
192         }
193         return retStr;
194     }
195 
196     /**
197      * Compute a hashcode for this BreakIterator
198      * @return A hash code
199      */
200     @Override
hashCode()201     public int hashCode()
202     {
203         return fRData.fRuleSource.hashCode();
204     }
205 
206 
207     private static final int  START_STATE = 1;     // The state number of the starting state
208     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
209 
210     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
211     //               of user text.  A variable with this enum type keeps track of where we
212     //               are.  The state machine only fetches user text input while in RUN mode.
213     private static final int  RBBI_START  = 0;
214     private static final int  RBBI_RUN    = 1;
215     private static final int  RBBI_END    = 2;
216 
217     /**
218      * The character iterator through which this BreakIterator accesses the text.
219      */
220     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
221 
222     /**
223      * The rule data for this BreakIterator instance.
224      * Not intended for public use. Declared public for testing purposes only.
225      * @deprecated This API is ICU internal only.
226      * @hide draft / provisional / internal are hidden on Android
227      */
228     @Deprecated
229     public RBBIDataWrapper    fRData;
230 
231     /**
232      *  The iteration state - current position, rule status for the current position,
233      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
234      *                        Current position is pinned to be 0 < position <= text.length.
235      *                        Current position is always set to a boundary.
236      *
237      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
238      *  Never has the value UBRK_DONE (-1).
239      */
240     private int                fPosition;
241 
242     /**
243      * Index of the Rule {tag} values for the most recent match.
244      */
245     private int                fRuleStatusIndex;
246 
247     /**
248      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
249      */
250     private boolean            fDone;
251 
252     /**
253      *  Array of look-ahead tentative results.
254      */
255     private int[]              fLookAheadMatches;
256 
257     /**
258      *   Cache of previously determined boundary positions.
259      */
260     private BreakCache         fBreakCache = new BreakCache();
261 
262 
263     /**
264      * Counter for the number of characters encountered with the "dictionary"
265      *   flag set.  Normal RBBI iterators don't use it, although the code
266      *   for updating it is live.  Dictionary Based break iterators (a subclass
267      *   of us) access this field directly.
268      * @hide draft / provisional / internal are hidden on Android
269      */
270     private int fDictionaryCharCount;
271 
272     private DictionaryCache     fDictionaryCache = new DictionaryCache();
273 
274     /**
275      * ICU debug argument name for RBBI
276      */
277     private static final String RBBI_DEBUG_ARG = "rbbi";
278 
279     /**
280      * Debugging flag.  Trace operation of state machine when true.
281      */
282     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
283             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
284 
285     /**
286      * The "default" break engine - just skips over ranges of dictionary words,
287      * producing no breaks. Should only be used if characters need to be handled
288      * by a dictionary but we have no dictionary implementation for them.
289      *
290      * Only one instance; shared by all break iterators.
291      */
292     private static final UnhandledBreakEngine gUnhandledBreakEngine;
293 
294     /**
295      * List of all known break engines, common for all break iterators.
296      * Lazily updated as break engines are needed, because instantiation of
297      * break engines is expensive.
298      *
299      * Because gAllBreakEngines can be referenced concurrently from different
300      * BreakIterator instances, all access is synchronized.
301      */
302     private static final List<LanguageBreakEngine> gAllBreakEngines;
303 
304     static {
305         gUnhandledBreakEngine = new UnhandledBreakEngine();
306         gAllBreakEngines = new ArrayList<>();
307         gAllBreakEngines.add(gUnhandledBreakEngine);
308     }
309 
310     /**
311      * List of all known break engines. Similar to gAllBreakEngines, but local to a
312      * break iterator, allowing it to be used without synchronization.
313      */
314     private List<LanguageBreakEngine> fBreakEngines;
315 
316     /**
317      * Dump the contents of the state table and character classes for this break iterator.
318      * For debugging only.
319      * @deprecated This API is ICU internal only.
320      * @hide draft / provisional / internal are hidden on Android
321      */
322     @Deprecated
dump(java.io.PrintStream out)323     public void dump(java.io.PrintStream out) {
324         if (out == null) {
325             out = System.out;
326         }
327         this.fRData.dump(out);
328     }
329 
330     /**
331      * Compile a set of source break rules into the binary state tables used
332      * by the break iterator engine.  Creating a break iterator from precompiled
333      * rules is much faster than creating one from source rules.
334      *
335      * Binary break rules are not guaranteed to be compatible between different
336      * versions of ICU.
337      *
338      *
339      * @param rules  The source form of the break rules
340      * @param ruleBinary  An output stream to receive the compiled rules.
341      * @throws IOException If there is an error writing the output.
342      * @see #getInstanceFromCompiledRules(InputStream)
343      */
compileRules(String rules, OutputStream ruleBinary)344     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
345         RBBIRuleBuilder.compileRules(rules, ruleBinary);
346     }
347 
348     //=======================================================================
349     // BreakIterator overrides
350     //=======================================================================
351 
352     /**
353      * Sets the current iteration position to the beginning of the text.
354      * (i.e., the CharacterIterator's starting offset).
355      * @return The offset of the beginning of the text.
356      */
357     @Override
first()358     public int first() {
359         if (fText == null) {
360             return BreakIterator.DONE;
361         }
362         fText.first();
363         int start =  fText.getIndex();
364         if (!fBreakCache.seek(start)) {
365             fBreakCache.populateNear(start);
366         }
367         fBreakCache.current();
368         assert(fPosition == start);
369         return fPosition;
370     }
371 
372     /**
373      * Sets the current iteration position to the end of the text.
374      * (i.e., the CharacterIterator's ending offset).
375      * @return The text's past-the-end offset.
376      */
377     @Override
last()378     public int last() {
379         if (fText == null) {
380             return BreakIterator.DONE;
381         }
382         int endPos = fText.getEndIndex();
383         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
384         assert(endShouldBeBoundary);
385         if (fPosition != endPos) {
386             assert(fPosition == endPos);
387         }
388         return endPos;
389     }
390 
391     /**
392      * Advances the iterator either forward or backward the specified number of steps.
393      * Negative values move backward, and positive values move forward.  This is
394      * equivalent to repeatedly calling next() or previous().
395      * @param n The number of steps to move.  The sign indicates the direction
396      * (negative is backwards, and positive is forwards).
397      * @return The character offset of the boundary position n boundaries away from
398      * the current one.
399      */
400     @Override
next(int n)401     public int next(int n) {
402         int result = 0;
403         if (n > 0) {
404             for (; n > 0 && result != DONE; --n) {
405                 result = next();
406             }
407         } else if (n < 0) {
408             for (; n < 0 && result != DONE; ++n) {
409                 result = previous();
410             }
411         } else {
412             result = current();
413         }
414         return result;
415     }
416 
417     /**
418      * Advances the iterator to the next boundary position.
419      * @return The position of the first boundary after this one.
420      */
421     @Override
next()422     public int next() {
423         fBreakCache.next();
424         return fDone ? DONE : fPosition;
425     }
426 
427     /**
428      * Moves the iterator backwards, to the boundary preceding the current one.
429      * @return The position of the boundary position immediately preceding the starting position.
430      */
431     @Override
previous()432     public int previous() {
433         fBreakCache.previous();
434         return fDone ? DONE : fPosition;
435     }
436 
437     /**
438      * Sets the iterator to refer to the first boundary position following
439      * the specified position.
440      * @param startPos The position from which to begin searching for a break position.
441      * @return The position of the first break after the current position.
442      */
443     @Override
following(int startPos)444     public int following(int startPos) {
445         // if the supplied position is before the beginning, return the
446         // text's starting offset
447         if (startPos < fText.getBeginIndex()) {
448             return first();
449         }
450 
451         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
452         // Or it may be beyond the end of the text.
453         startPos = CISetIndex32(fText, startPos);
454         fBreakCache.following(startPos);
455         return fDone ? DONE : fPosition;
456     }
457 
458 
459     /**
460      * Sets the iterator to refer to the last boundary position before the
461      * specified position.
462      * @param offset The position to begin searching for a break from.
463      * @return The position of the last boundary before the starting position.
464      */
465     @Override
preceding(int offset)466     public int preceding(int offset) {
467         if (fText == null || offset > fText.getEndIndex()) {
468             return last();
469         } else if (offset < fText.getBeginIndex()) {
470             return first();
471         }
472 
473         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
474         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
475         int adjustedOffset = offset;
476         fBreakCache.preceding(adjustedOffset);
477         return fDone ? DONE : fPosition;
478 
479     }
480 
481 
482     /**
483      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
484      */
checkOffset(int offset, CharacterIterator text)485     protected static final void checkOffset(int offset, CharacterIterator text) {
486         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
487             throw new IllegalArgumentException("offset out of bounds");
488         }
489     }
490 
491 
492     /**
493      * Returns true if the specified position is a boundary position.  As a side
494      * effect, leaves the iterator pointing to the first boundary position at
495      * or after "offset".
496      * @param offset the offset to check.
497      * @return True if "offset" is a boundary position.
498      */
499     @Override
isBoundary(int offset)500     public boolean isBoundary(int offset) {
501         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
502         //       to not be boundaries, and to not be errors.
503         checkOffset(offset, fText);
504 
505         // Adjust offset to be on a code point boundary and not beyond the end of the text.
506         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
507         // But we still need the side effect of leaving iteration at the following boundary.
508         int adjustedOffset = CISetIndex32(fText, offset);
509 
510         boolean result = false;
511         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
512             result = (fBreakCache.current() == offset);
513         }
514 
515         if (!result) {
516             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
517             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
518             next();
519         }
520         return result;
521 
522     }
523 
524     /**
525      * Returns the current iteration position.  Note that DONE is never
526      * returned from this function; if iteration has run to the end of a
527      * string, current() will return the length of the string while
528      * next() will return BreakIterator.DONE).
529      * @return The current iteration position.
530      */
531     @Override
current()532     public int current() {
533         return (fText != null) ? fPosition : BreakIterator.DONE;
534     }
535 
536 
537     /**
538      * Return the status tag from the break rule that determined the boundary at
539      * the current iteration position.  The values appear in the rule source
540      * within brackets, {123}, for example.  For rules that do not specify a
541      * status, a default value of 0 is returned.  If more than one rule applies,
542      * the numerically largest of the possible status values is returned.
543      * <p>
544      * Of the standard types of ICU break iterators, only the word and line break
545      * iterator provides status values.  The values are defined in
546      * class RuleBasedBreakIterator, and allow distinguishing between words
547      * that contain alphabetic letters, "words" that appear to be numbers,
548      * punctuation and spaces, words containing ideographic characters, and
549      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
550      * position from <code>next()</code>, <code>previous()</code>, or
551      * any other break iterator functions that returns a boundary position.
552      * <p>
553      * Note that <code>getRuleStatus()</code> returns the value corresponding to
554      * <code>current()</code> index even after <code>next()</code> has returned DONE.
555      * <p>
556 
557      * @return the status from the break rule that determined the boundary
558      * at the current iteration position.
559      */
560 
561     @Override
getRuleStatus()562     public int  getRuleStatus() {
563         //   Status records have this form:
564         //           Count N         <--  fLastRuleStatusIndex points here.
565         //           Status val 0
566         //           Status val 1
567         //              ...
568         //           Status val N-1  <--  the value we need to return
569         //   The status values are sorted in ascending order.
570         //   This function returns the last (largest) of the array of status values.
571         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
572         int  tagVal = fRData.fStatusTable[idx];
573         return tagVal;
574     }
575 
576     /**
577      * Get the status (tag) values from the break rule(s) that determined the boundary
578      * at the current iteration position.  The values appear in the rule source
579      * within brackets, {123}, for example.  The default status value for rules
580      * that do not explicitly provide one is zero.
581      * <p>
582      * The status values used by the standard ICU break rules are defined
583      * as public constants in class RuleBasedBreakIterator.
584      * <p>
585      * If the size  of the output array is insufficient to hold the data,
586      *  the output will be truncated to the available length.  No exception
587      *  will be thrown.
588      *
589      * @param fillInArray an array to be filled in with the status values.
590      * @return          The number of rule status values from the rules that determined
591      *                  the boundary at the current iteration position.
592      *                  In the event that the array is too small, the return value
593      *                  is the total number of status values that were available,
594      *                  not the reduced number that were actually returned.
595      */
596     @Override
getRuleStatusVec(int[] fillInArray)597     public int getRuleStatusVec(int[] fillInArray) {
598         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
599         if (fillInArray != null) {
600             int numToCopy = Math.min(numStatusVals, fillInArray.length);
601             for (int i=0; i<numToCopy; i++) {
602                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
603             }
604         }
605         return numStatusVals;
606     }
607 
608     /**
609      * Returns a CharacterIterator over the text being analyzed.
610      * <p>
611      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
612      * must not be modified in any way while the BreakIterator is still in use.
613      * Doing so will lead to undefined behavior of the BreakIterator.
614      * Clone the returned CharacterIterator first and work with that.
615      * <p>
616      * The returned CharacterIterator is a reference
617      * to the <b>actual iterator being used</b> by the BreakIterator.
618      * No guarantees are made about the current position
619      * of this iterator when it is returned; it may differ from the
620      * BreakIterators current position.  If you need to move that
621      * position to examine the text, clone this function's return value first.
622      * @return An iterator over the text being analyzed.
623      */
624     @Override
getText()625     public CharacterIterator getText() {
626         return fText;
627     }
628 
629     /**
630      * Set the iterator to analyze a new piece of text.  This function resets
631      * the current iteration position to the beginning of the text.
632      * (The old iterator is dropped.)
633      * <p>
634      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
635      * directly by the BreakIterator, and must not be altered in any
636      * way by code outside of the BreakIterator.
637      * Doing so will lead to undefined behavior of the BreakIterator.
638      *
639      * @param newText An iterator over the text to analyze.
640      */
641     @Override
setText(CharacterIterator newText)642     public void setText(CharacterIterator newText) {
643         if (newText != null) {
644             fBreakCache.reset(newText.getBeginIndex(), 0);
645         } else {
646             fBreakCache.reset();
647         }
648         fDictionaryCache.reset();
649         fText = newText;
650         this.first();
651     }
652 
653      /**
654      * Control debug, trace and dump options.
655      * @deprecated This API is ICU internal only.
656      * @hide draft / provisional / internal are hidden on Android
657      */
658     @Deprecated
659     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
660                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
661 
662 
getLanguageBreakEngine(int c)663     private LanguageBreakEngine getLanguageBreakEngine(int c) {
664 
665         // We have a dictionary character.
666         // Does an already instantiated break engine handle it?
667         for (LanguageBreakEngine candidate : fBreakEngines) {
668             if (candidate.handles(c)) {
669                 return candidate;
670             }
671         }
672 
673         synchronized (gAllBreakEngines) {
674             // This break iterator's list of break engines didn't handle the character.
675             // Check the global list, another break iterator may have instantiated the
676             // desired engine.
677             for (LanguageBreakEngine candidate : gAllBreakEngines) {
678                 if (candidate.handles(c)) {
679                     fBreakEngines.add(candidate);
680                     return candidate;
681                 }
682             }
683 
684             // The global list doesn't have an existing engine, build one.
685             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
686             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
687                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
688                 // Fold them together for mapping from script -> engine.
689                 script = UScript.HAN;
690             }
691 
692             LanguageBreakEngine eng;
693             try {
694                 switch (script) {
695                 case UScript.THAI:
696                     eng = new ThaiBreakEngine();
697                     break;
698                 case UScript.LAO:
699                     eng = new LaoBreakEngine();
700                     break;
701                 case UScript.MYANMAR:
702                     eng = new BurmeseBreakEngine();
703                     break;
704                 case UScript.KHMER:
705                     eng = new KhmerBreakEngine();
706                     break;
707                 case UScript.HAN:
708                     eng = new CjkBreakEngine(false);
709                      break;
710                 case UScript.HANGUL:
711                     eng = new CjkBreakEngine(true);
712                     break;
713                 default:
714                     gUnhandledBreakEngine.handleChar(c);
715                     eng = gUnhandledBreakEngine;
716                     break;
717                 }
718             } catch (IOException e) {
719                 eng = null;
720             }
721 
722             if (eng != null && eng != gUnhandledBreakEngine) {
723                 gAllBreakEngines.add(eng);
724                 fBreakEngines.add(eng);
725             }
726             return eng;
727         }   // end synchronized(gAllBreakEngines)
728     }
729 
730     /**
731      * The State Machine Engine for moving forward is here.
732      * This function is the heart of the RBBI run time engine.
733      *
734      * Input
735      *    fPosition, the position in the text to begin from.
736      * Output
737      *    fPosition:           the boundary following the starting position.
738      *    fDictionaryCharCount the number of dictionary characters encountered.
739      *                         If > 0, the segment will be further subdivided
740      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
741      *
742      * @return the new iterator position
743      *
744      * A note on supplementary characters and the position of underlying
745      * Java CharacterIterator:   Normally, a character iterator is positioned at
746      * the char most recently returned by next().  Within this function, when
747      * a supplementary char is being processed, the char iterator is left
748      * sitting on the trail surrogate, in the middle of the code point.
749      * This is different from everywhere else, where an iterator always
750      * points at the lead surrogate of a supplementary.
751      */
handleNext()752     private int handleNext() {
753         if (TRACE) {
754             System.out.println("Handle Next   pos      char  state category");
755         }
756 
757         // handleNext always sets the break tag value.
758         // Set the default for it.
759         fRuleStatusIndex  = 0;
760         fDictionaryCharCount = 0;
761 
762         // caches for quicker access
763         CharacterIterator text = fText;
764         CodePointTrie trie = fRData.fTrie;
765 
766         char[] stateTable  = fRData.fFTable.fTable;
767         int initialPosition = fPosition;
768         text.setIndex(initialPosition);
769         int result          = initialPosition;
770 
771         // Set up the starting char
772         int c = text.current();
773         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
774             c = nextTrail32(text, c);
775             if (c == DONE32) {
776                 fDone = true;
777                 return BreakIterator.DONE;
778             }
779         }
780 
781         // Set the initial state for the state machine
782         int state           = START_STATE;
783         int row             = fRData.getRowIndex(state);
784         short category      = 3;
785         int flagsState      = fRData.fFTable.fFlags;
786         int dictStart       = fRData.fFTable.fDictCategoriesStart;
787         int mode            = RBBI_RUN;
788         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
789             category = 2;
790             mode     = RBBI_START;
791             if (TRACE) {
792                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
793                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
794                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
795             }
796         }
797 
798         // loop until we reach the end of the text or transition to state 0
799         while (state != STOP_STATE) {
800             if (c == DONE32) {
801                 // Reached end of input string.
802                 if (mode == RBBI_END) {
803                     // We have already run the loop one last time with the
804                     // character set to the pseudo {eof} value. Now it is time
805                     // to unconditionally bail out.
806                     break;
807                 }
808                 // Run the loop one last time with the fake end-of-input character category
809                 mode = RBBI_END;
810                 category = 1;
811             }
812             else if (mode == RBBI_RUN) {
813                 // Get the char category.  An incoming category of 1 or 2 mens that
814                 //      we are preset for doing the beginning or end of input, and
815                 //      that we shouldn't get a category from an actual text input character.
816                 //
817 
818                 // look up the current character's character category, which tells us
819                 // which column in the state table to look at.
820                 //
821                 category = (short) trie.get(c);
822 
823                 // Check for categories that require word dictionary handling.
824                 if (category >= dictStart) {
825                     fDictionaryCharCount++;
826                 }
827 
828                 if (TRACE) {
829                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
830                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
831                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
832                 }
833 
834                 // Advance to the next character.
835                 // If this is a beginning-of-input loop iteration, don't advance.
836                 //    The next iteration will be processing the first real input character.
837                 c = text.next();
838                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
839                     c = nextTrail32(text, c);
840                 }
841             }
842             else {
843                 mode = RBBI_RUN;
844             }
845 
846             // look up a state transition in the state table
847             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
848             row   = fRData.getRowIndex(state);
849             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
850             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
851                 // Match found, common case
852                 result = text.getIndex();
853                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
854                     // The iterator has been left in the middle of a surrogate pair.
855                     // We want the start of it.
856                     result--;
857                 }
858 
859                 //  Remember the break status (tag) values.
860                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
861             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
862                 // Lookahead match is completed
863                 int lookaheadResult = fLookAheadMatches[accepting];
864                 if (lookaheadResult >= 0) {
865                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
866                     fPosition = lookaheadResult;
867                     return lookaheadResult;
868                 }
869             }
870 
871 
872             // If we are at the position of the '/' in a look-ahead (hard break) rule;
873             // record the current position, to be returned later, if the full rule matches.
874             // TODO: Move this check before the previous check of fAccepting.
875             //       This would enable hard-break rules with no following context.
876             //       But there are line break test failures when trying this. Investigate.
877             //       Issue ICU-20837
878             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
879             if (rule != 0) {
880                 int  pos = text.getIndex();
881                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
882                     // The iterator has been left in the middle of a surrogate pair.
883                     // We want the beginning  of it.
884                     pos--;
885                 }
886                 fLookAheadMatches[rule] = pos;
887             }
888 
889 
890         }        // End of state machine main loop
891 
892         // The state machine is done.  Check whether it found a match...
893 
894         // If the iterator failed to advance in the match engine force it ahead by one.
895         // This indicates a defect in the break rules, which should always match
896         // at least one character.
897 
898         if (result == initialPosition) {
899             if (TRACE) {
900                 System.out.println("Iterator did not move. Advancing by 1.");
901             }
902             text.setIndex(initialPosition);
903             next32(text);
904             result = text.getIndex();
905             fRuleStatusIndex = 0;
906         }
907 
908         // Leave the iterator at our result position.
909         //   (we may have advanced beyond the last accepting position chasing after
910         //    longer matches that never completed.)
911         fPosition = result;
912 
913         if (TRACE) {
914             System.out.println("result = " + result);
915         }
916         return result;
917     }
918 
919     /**
920      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
921      * This locates a "Safe Position" from which the forward break rules
922      * will operate correctly. A Safe Position is not necessarily a boundary itself.
923      *
924      * The logic of this function is very similar to handleNext(), above, but simpler
925      * because the safe table does not require as many options.
926      *
927      * @param fromPosition the position in the input text to begin the iteration.
928      * @hide draft / provisional / internal are hidden on Android
929      */
handleSafePrevious(int fromPosition)930     private int handleSafePrevious(int fromPosition) {
931         char            state;
932         short           category = 0;
933         int             result = 0;
934 
935         // caches for quicker access
936         CharacterIterator text = fText;
937         CodePointTrie trie = fRData.fTrie;
938         char[] stateTable  = fRData.fRTable.fTable;
939 
940         CISetIndex32(text, fromPosition);
941         if (TRACE) {
942             System.out.print("Handle Previous   pos   char  state category");
943         }
944 
945         // if we're already at the start of the text, return DONE.
946         if (text.getIndex() == text.getBeginIndex()) {
947             return BreakIterator.DONE;
948         }
949 
950         //  Set the initial state for the state machine
951         int c = CharacterIteration.previous32(text);
952         state = START_STATE;
953         int row = fRData.getRowIndex(state);
954 
955         // loop until we reach the start of the text or transition to state 0
956         //
957         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
958 
959             // look up the current character's character category, which tells us
960             // which column in the state table to look at.
961             //
962             //  And off the dictionary flag bit. For reverse iteration it is not used.
963             category = (short) trie.get(c);
964             if (TRACE) {
965                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
966                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
967                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
968             }
969 
970             // State Transition - move machine to its next state
971             //
972             assert(category < fRData.fHeader.fCatCount);
973             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
974             row   = fRData.getRowIndex(state);
975 
976             if (state == STOP_STATE) {
977                 // This is the normal exit from the lookup state machine.
978                 // Transition to state zero means we have found a safe point.
979                 break;
980             }
981         }
982 
983         // The state machine is done.
984         result = text.getIndex();
985         if (TRACE) {
986             System.out.println("result = " + result);
987         }
988         return result;
989     }
990 
991     /**
992      * Set the index of a CharacterIterator.
993      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
994      * If the index points to a trail surrogate of a supplementary character, adjust it
995      * to the start (lead surrogate) index.
996      *
997      * @param ci A CharacterIterator to set
998      * @param index the index to set
999      * @return the resulting index, possibly pinned or adjusted.
1000      */
1001     private static int CISetIndex32(CharacterIterator ci, int index) {
1002         if (index <= ci.getBeginIndex()) {
1003             ci.first();
1004         } else if (index >= ci.getEndIndex()) {
1005             ci.setIndex(ci.getEndIndex());
1006         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1007             if (!Character.isHighSurrogate(ci.previous())) {
1008                 ci.next();
1009             }
1010         }
1011         return ci.getIndex();
1012     }
1013 
1014     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1015      *                 Dictionary boundaries are moved first to this cache, then from here
1016      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1017      *                 boundaries. The public BreakIterator API always fetches directly
1018      *                 from the main BreakCache, not from here.
1019      *
1020      *                 In common situations, the number of boundaries in a single dictionary run
1021      *                 should be quite small, it will be terminated by punctuation, spaces,
1022      *                 or any other non-dictionary characters. The main BreakCache may end
1023      *                 up with boundaries from multiple dictionary based runs.
1024      *
1025      *                 The boundaries are stored in a simple ArrayList (vector), with the
1026      *                 assumption that they will be accessed sequentially.
1027      */
1028     class DictionaryCache  {
1029 
1030          void reset() {
1031              fPositionInCache = -1;
1032              fStart = 0;
1033              fLimit = 0;
1034              fFirstRuleStatusIndex = 0;
1035              fOtherRuleStatusIndex = 0;
1036              fBreaks.removeAllElements();
1037          };
1038 
1039          boolean following(int fromPos) {
1040              if (fromPos >= fLimit || fromPos < fStart) {
1041                  fPositionInCache = -1;
1042                  return false;
1043              }
1044 
1045              // Sequential iteration, move from previous boundary to the following
1046 
1047              int r = 0;
1048              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1049                  ++fPositionInCache;
1050                  if (fPositionInCache >= fBreaks.size()) {
1051                      fPositionInCache = -1;
1052                      return false;
1053                  }
1054                  r = fBreaks.elementAt(fPositionInCache);
1055                  assert(r > fromPos);
1056                  fBoundary = r;
1057                  fStatusIndex = fOtherRuleStatusIndex;
1058                  return true;
1059              }
1060 
1061              // Random indexing. Linear search for the boundary following the given position.
1062 
1063              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1064                  r= fBreaks.elementAt(fPositionInCache);
1065                  if (r > fromPos) {
1066                      fBoundary = r;
1067                      fStatusIndex = fOtherRuleStatusIndex;
1068                      return true;
1069                  }
1070              }
1071 
1072              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1073              assert(false);
1074              fPositionInCache = -1;
1075              return false;
1076          };
1077 
preceding(int fromPos)1078          boolean preceding(int fromPos) {
1079              if (fromPos <= fStart || fromPos > fLimit) {
1080                  fPositionInCache = -1;
1081                  return false;
1082              }
1083 
1084              if (fromPos == fLimit) {
1085                  fPositionInCache = fBreaks.size() - 1;
1086                  if (fPositionInCache >= 0) {
1087                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1088                  }
1089              }
1090 
1091              int r;
1092              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1093                  --fPositionInCache;
1094                  r = fBreaks.elementAt(fPositionInCache);
1095                  assert(r < fromPos);
1096                  fBoundary = r;
1097                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1098                  return true;
1099              }
1100 
1101              if (fPositionInCache == 0) {
1102                  fPositionInCache = -1;
1103                  return false;
1104              }
1105 
1106              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1107                  r = fBreaks.elementAt(fPositionInCache);
1108                  if (r < fromPos) {
1109                      fBoundary = r;
1110                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1111                      return true;
1112                  }
1113              }
1114              assert(false);
1115              fPositionInCache = -1;
1116              return false;
1117          };
1118 
1119         /**
1120          * Populate the cache with the dictionary based boundaries within a region of text.
1121          * @param startPos  The start position of a range of text
1122          * @param endPos    The end position of a range of text
1123          * @param firstRuleStatus The rule status index that applies to the break at startPos
1124          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1125          * @hide draft / provisional / internal are hidden on Android
1126          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1127         void populateDictionary(int startPos, int endPos,
1128                                 int firstRuleStatus, int otherRuleStatus) {
1129             if ((endPos - startPos) <= 1) {
1130                 return;
1131             }
1132 
1133             reset();
1134             fFirstRuleStatusIndex = firstRuleStatus;
1135             fOtherRuleStatusIndex = otherRuleStatus;
1136 
1137             int rangeStart = startPos;
1138             int rangeEnd = endPos;
1139 
1140             int         category;
1141             int         current;
1142             int         foundBreakCount = 0;
1143 
1144             // Loop through the text, looking for ranges of dictionary characters.
1145             // For each span, find the appropriate break engine, and ask it to find
1146             // any breaks within the span.
1147 
1148             fText.setIndex(rangeStart);
1149             int     c = CharacterIteration.current32(fText);
1150             category = (short)fRData.fTrie.get(c);
1151             int dictStart = fRData.fFTable.fDictCategoriesStart;
1152 
1153             while(true) {
1154                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1155                     c = CharacterIteration.next32(fText);    // pre-increment
1156                     category = (short)fRData.fTrie.get(c);
1157                 }
1158                 if (current >= rangeEnd) {
1159                     break;
1160                 }
1161 
1162                 // We now have a dictionary character. Get the appropriate language object
1163                 // to deal with it.
1164                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1165 
1166                 // Ask the language object if there are any breaks. It will add them to the cache and
1167                 // leave the text pointer on the other side of its range, ready to search for the next one.
1168                 if (lbe != null) {
1169                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
1170                 }
1171 
1172                 // Reload the loop variables for the next go-round
1173                 c = CharacterIteration.current32(fText);
1174                 category = (short)fRData.fTrie.get(c);
1175             }
1176 
1177             // If we found breaks, ensure that the first and last entries are
1178             // the original starting and ending position. And initialize the
1179             // cache iteration position to the first entry.
1180 
1181             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1182             if (foundBreakCount > 0) {
1183                 assert(foundBreakCount == fBreaks.size());
1184                 if (startPos < fBreaks.elementAt(0)) {
1185                     // The dictionary did not place a boundary at the start of the segment of text.
1186                     // Add one now. This should not commonly happen, but it would be easy for interactions
1187                     // of the rules for dictionary segments and the break engine implementations to
1188                     // inadvertently cause it. Cover it here, just in case.
1189                     fBreaks.offer(startPos);
1190                 }
1191                 if (endPos > fBreaks.peek()) {
1192                     fBreaks.push(endPos);
1193                 }
1194                 fPositionInCache = 0;
1195                 // Note: Dictionary matching may extend beyond the original limit.
1196                 fStart = fBreaks.elementAt(0);
1197                 fLimit = fBreaks.peek();
1198             } else {
1199                 // there were no language-based breaks, even though the segment contained
1200                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1201                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1202             }
1203 
1204         };
1205 
1206 
DictionaryCache()1207         DictionaryCache() {
1208             fPositionInCache = -1;
1209             fBreaks = new DictionaryBreakEngine.DequeI();
1210         }
1211 
1212         /**
1213          * copy constructor. Used by RuleBasedBreakIterator.clone().
1214          *
1215          * @param src the source object to be copied.
1216          */
DictionaryCache(DictionaryCache src)1217         DictionaryCache(DictionaryCache src)  {
1218             try {
1219                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1220             }
1221             catch (CloneNotSupportedException e) {
1222                 throw new RuntimeException(e);
1223             }
1224             fPositionInCache      = src.fPositionInCache;
1225             fStart                = src.fStart;
1226             fLimit                = src.fLimit;
1227             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1228             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1229             fBoundary             = src.fBoundary;
1230             fStatusIndex          = src.fStatusIndex;
1231         }
1232 
1233         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1234         DictionaryBreakEngine.DequeI fBreaks;
1235         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1236         //                                      //    or preceding(). Optimizes sequential access.
1237         int             fStart;                 // Text position of first boundary in cache.
1238         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1239         //                                      //    text segment being handled by the dictionary.
1240         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1241         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1242         int             fBoundary;              // Current boundary. Set by preceding(), following().
1243         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1244     };
1245 
1246 
1247 
1248 
1249 /*
1250  * class BreakCache
1251  *
1252  * Cache of break boundary positions and rule status values.
1253  * Break iterator API functions, next(), previous(), etc., will use cached results
1254  * when possible, and otherwise cache new results as they are obtained.
1255  *
1256  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1257  *
1258  * The cache is implemented as a single circular buffer.
1259  */
1260 
1261 /*
1262  * size of the circular cache buffer.
1263  */
1264 
1265 class BreakCache {
1266 
BreakCache()1267     BreakCache() {
1268         reset();
1269     };
1270 
reset(int pos, int ruleStatus)1271     void  reset(int pos, int ruleStatus) {
1272         fStartBufIdx = 0;
1273         fEndBufIdx = 0;
1274         fTextIdx = pos;
1275         fBufIdx = 0;
1276         fBoundaries[0] = pos;
1277         fStatuses[0] = (short)ruleStatus;
1278     }
1279 
reset()1280     void  reset() {reset(0, 0); };
1281 
next()1282     void  next() {
1283         if (fBufIdx == fEndBufIdx) {
1284             fDone = !populateFollowing();
1285             fPosition = fTextIdx;
1286             fRuleStatusIndex = fStatuses[fBufIdx];
1287         } else {
1288             fBufIdx = modChunkSize(fBufIdx + 1);
1289             fTextIdx = fPosition = fBoundaries[fBufIdx];
1290             fRuleStatusIndex = fStatuses[fBufIdx];
1291         }
1292     };
1293 
previous()1294     void  previous() {
1295         int initialBufIdx = fBufIdx;
1296         if (fBufIdx == fStartBufIdx) {
1297             // At start of cache. Prepend to it.
1298             populatePreceding();
1299         } else {
1300             // Cache already holds the next boundary
1301             fBufIdx = modChunkSize(fBufIdx - 1);
1302             fTextIdx = fBoundaries[fBufIdx];
1303         }
1304         fDone = (fBufIdx == initialBufIdx);
1305         fPosition = fTextIdx;
1306         fRuleStatusIndex = fStatuses[fBufIdx];
1307         return;
1308     };
1309 
1310     // Move the iteration state to the position following the startPosition.
1311     // Input position must be pinned to the input length.
following(int startPos)1312     void following(int startPos) {
1313         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1314             // startPos is in the cache. Do a next() from that position.
1315             // TODO: an awkward set of interactions with bi->fDone
1316             //       seek() does not clear it; it can't because of interactions with populateNear().
1317             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1318             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1319             fDone = false;
1320             next();
1321         }
1322 
1323     };
1324 
preceding(int startPos)1325     void  preceding(int startPos) {
1326         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1327             if (startPos == fTextIdx) {
1328                 previous();
1329             } else {
1330                 // seek() leaves the BreakCache positioned at the preceding boundary
1331                 //        if the requested position is between two bounaries.
1332                 // current() pushes the BreakCache position out to the BreakIterator itself.
1333                 assert(startPos > fTextIdx);
1334                 current();
1335             }
1336         }
1337         return;
1338     };
1339 
1340     /**
1341      * Update the state of the public BreakIterator (fBI) to reflect the
1342      * current state of the break iterator cache (this).
1343      */
current()1344     int current() {
1345         fPosition = fTextIdx;
1346         fRuleStatusIndex = fStatuses[fBufIdx];
1347         fDone = false;
1348         return fTextIdx;
1349     };
1350 
1351     /**
1352      * Add boundaries to the cache near the specified position.
1353      * The given position need not be a boundary itself.
1354      * The input position must be within the range of the text, and
1355      * on a code point boundary.
1356      * If the requested position is a break boundary, leave the iteration
1357      * position on it.
1358      * If the requested position is not a boundary, leave the iteration
1359      * position on the preceding boundary and include both the the
1360      * preceding and following boundaries in the cache.
1361      * Additional boundaries, either preceding or following, may be added
1362      * to the cache as a side effect.
1363      *
1364      * Return false if the operation failed.
1365      */
populateNear(int position)1366     boolean populateNear(int position) {
1367         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1368 
1369         // Find a boundary somewhere in the vicinity of the requested position.
1370         // Depending on the safe rules and the text data, it could be either before, at, or after
1371         // the requested position.
1372 
1373 
1374         // If the requested position is not near already cached positions, clear the existing cache,
1375         // find a near-by boundary and begin new cache contents there.
1376 
1377         if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1378             int aBoundary = fText.getBeginIndex();
1379             int ruleStatusIndex = 0;
1380             if (position > aBoundary + 20) {
1381                 int backupPos = handleSafePrevious(position);
1382                 if (backupPos > aBoundary) {
1383                     // Advance to the boundary following the backup position.
1384                     // There is a complication: the safe reverse rules identify pairs of code points
1385                     // that are safe. If advancing from the safe point moves forwards by less than
1386                     // two code points, we need to advance one more time to ensure that the boundary
1387                     // is good, including a correct rules status value.
1388                     //
1389                     fPosition = backupPos;
1390                     aBoundary = handleNext();
1391                     if (aBoundary == backupPos + 1 ||
1392                             (aBoundary == backupPos + 2 &&
1393                             Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1394                             Character.isLowSurrogate(fText.next()))) {
1395                         // The initial handleNext() only advanced by a single code point. Go again.
1396                         // Safe rules identify safe pairs.
1397                         aBoundary = handleNext();
1398                     }
1399                 }
1400                 ruleStatusIndex = fRuleStatusIndex;
1401             }
1402             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1403         }
1404 
1405         // Fill in boundaries between existing cache content and the new requested position.
1406 
1407         if (fBoundaries[fEndBufIdx] < position) {
1408             // The last position in the cache precedes the requested position.
1409             // Add following position(s) to the cache.
1410             while (fBoundaries[fEndBufIdx] < position) {
1411                 if (!populateFollowing()) {
1412                     assert false;
1413                     return false;
1414                 }
1415             }
1416             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1417             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1418             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1419                 previous();
1420             }
1421             return true;
1422         }
1423 
1424         if (fBoundaries[fStartBufIdx] > position) {
1425             // The first position in the cache is beyond the requested position.
1426             // back up more until we get a boundary <= the requested position.
1427             while (fBoundaries[fStartBufIdx] > position) {
1428                 populatePreceding();
1429             }
1430             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1431             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1432             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1433                 next();
1434             }
1435             if (fTextIdx > position) {
1436                 // If position is not itself a boundary, the next() loop above will overshoot.
1437                 // Back up one, leaving cache position at the boundary preceding the requested position.
1438                 previous();
1439             }
1440             return true;
1441         }
1442 
1443         assert fTextIdx == position;
1444         return true;
1445 
1446     };
1447 
1448     /**
1449      *  Add boundary(s) to the cache following the current last boundary.
1450      *  Return false if at the end of the text, and no more boundaries can be added.
1451      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1452      */
populateFollowing()1453     boolean populateFollowing() {
1454         int fromPosition = fBoundaries[fEndBufIdx];
1455         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1456         int pos = 0;
1457         int ruleStatusIdx = 0;
1458 
1459         if (fDictionaryCache.following(fromPosition)) {
1460             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1461             return true;
1462         }
1463 
1464         fPosition = fromPosition;
1465         pos = handleNext();
1466         if (pos == BreakIterator.DONE) {
1467             return false;
1468         }
1469 
1470         ruleStatusIdx = fRuleStatusIndex;
1471         if (fDictionaryCharCount > 0) {
1472             // The text segment obtained from the rules includes dictionary characters.
1473             // Subdivide it, with subdivided results going into the dictionary cache.
1474             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1475             if (fDictionaryCache.following(fromPosition)) {
1476                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1477                 return true;
1478                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1479                 //       But be careful with interactions with populateNear().
1480             }
1481         }
1482 
1483         // Rule based segment did not include dictionary characters.
1484         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1485         //    meaning that we didn't take the return, above.
1486         // Add its end point to the cache.
1487         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1488 
1489         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1490         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1491         //
1492         for (int count=0; count<6; ++count) {
1493             pos = handleNext();
1494             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1495                 break;
1496             }
1497             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1498         }
1499         return true;
1500     };
1501 
1502     /**
1503      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1504      *  Leave the iteration position on the first added boundary.
1505      *  Return false if no boundaries could be added (if at the start of the text.)
1506      */
populatePreceding()1507     boolean populatePreceding() {
1508         int textBegin = fText.getBeginIndex();
1509         int fromPosition = fBoundaries[fStartBufIdx];
1510         if (fromPosition == textBegin) {
1511             return false;
1512         }
1513 
1514         int position = textBegin;
1515         int positionStatusIdx = 0;
1516 
1517         if (fDictionaryCache.preceding(fromPosition)) {
1518             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1519             return true;
1520         }
1521 
1522         int backupPosition = fromPosition;
1523 
1524         // Find a boundary somewhere preceding the first already-cached boundary
1525         do {
1526             backupPosition = backupPosition - 30;
1527             if (backupPosition <= textBegin) {
1528                 backupPosition = textBegin;
1529             } else {
1530                 backupPosition = handleSafePrevious(backupPosition);
1531             }
1532             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1533                 position = textBegin;
1534                 positionStatusIdx = 0;
1535             } else {
1536                 // Advance to the boundary following the backup position.
1537                 // There is a complication: the safe reverse rules identify pairs of code points
1538                 // that are safe. If advancing from the safe point moves forwards by less than
1539                 // two code points, we need to advance one more time to ensure that the boundary
1540                 // is good, including a correct rules status value.
1541                 //
1542                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1543                 position = handleNext();
1544                 if (position == backupPosition + 1 ||
1545                         (position == backupPosition + 2 &&
1546                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1547                         Character.isLowSurrogate(fText.next()))) {
1548                     // The initial handleNext() only advanced by a single code point. Go again.
1549                     // Safe rules identify safe pairs.
1550                     position = handleNext();
1551                 }
1552                 positionStatusIdx = fRuleStatusIndex;
1553             }
1554         } while (position >= fromPosition);
1555 
1556         // Find boundaries between the one we just located and the first already-cached boundary
1557         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1558 
1559         fSideBuffer.removeAllElements();
1560         fSideBuffer.push(position);
1561         fSideBuffer.push(positionStatusIdx);
1562 
1563         do {
1564             int prevPosition = fPosition = position;
1565             int prevStatusIdx = positionStatusIdx;
1566             position = handleNext();
1567             positionStatusIdx = fRuleStatusIndex;
1568             if (position == BreakIterator.DONE) {
1569                 break;
1570             }
1571 
1572             boolean segmentHandledByDictionary = false;
1573             if (fDictionaryCharCount != 0) {
1574                 // Segment from the rules includes dictionary characters.
1575                 // Subdivide it, with subdivided results going into the dictionary cache.
1576                 int dictSegEndPosition = position;
1577                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1578                 while (fDictionaryCache.following(prevPosition)) {
1579                     position = fDictionaryCache.fBoundary;
1580                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1581                     segmentHandledByDictionary = true;
1582                     assert(position > prevPosition);
1583                     if (position >= fromPosition) {
1584                         break;
1585                     }
1586                     assert(position <= dictSegEndPosition);
1587                     fSideBuffer.push(position);
1588                     fSideBuffer.push(positionStatusIdx);
1589                     prevPosition = position;
1590                 }
1591                 assert(position==dictSegEndPosition || position>=fromPosition);
1592             }
1593 
1594             if (!segmentHandledByDictionary && position < fromPosition) {
1595                 fSideBuffer.push(position);
1596                 fSideBuffer.push(positionStatusIdx);
1597             }
1598         } while (position < fromPosition);
1599 
1600         // Move boundaries from the side buffer to the main circular buffer.
1601         boolean success = false;
1602         if (!fSideBuffer.isEmpty()) {
1603             positionStatusIdx = fSideBuffer.pop();
1604             position = fSideBuffer.pop();
1605             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1606             success = true;
1607         }
1608 
1609         while (!fSideBuffer.isEmpty()) {
1610             positionStatusIdx = fSideBuffer.pop();
1611             position = fSideBuffer.pop();
1612             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1613                 // No space in circular buffer to hold a new preceding result while
1614                 // also retaining the current cache (iteration) position.
1615                 // Bailing out is safe; the cache will refill again if needed.
1616                 break;
1617             }
1618         }
1619         return success;
1620     };
1621 
1622 
1623     static final boolean RetainCachePosition = false;
1624     static final boolean UpdateCachePosition = true;
1625 
1626     /**
1627      * Add the boundary following the current position.
1628      * The current position can be left as it was, or changed to the newly added boundary,
1629      * as specified by the update parameter.
1630      */
addFollowing(int position, int ruleStatusIdx, boolean update)1631     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1632         assert(position > fBoundaries[fEndBufIdx]);
1633         assert(ruleStatusIdx <= Short.MAX_VALUE);
1634         int nextIdx = modChunkSize(fEndBufIdx + 1);
1635         if (nextIdx == fStartBufIdx) {
1636             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1637         }
1638         fBoundaries[nextIdx] = position;
1639         fStatuses[nextIdx] = (short)ruleStatusIdx;
1640         fEndBufIdx = nextIdx;
1641         if (update == UpdateCachePosition) {
1642             // Set current position to the newly added boundary.
1643             fBufIdx = nextIdx;
1644             fTextIdx = position;
1645         } else {
1646             // Retaining the original cache position.
1647             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1648             // It's the responsibility of callers of this function to not add too many.
1649             assert(nextIdx != fBufIdx);
1650         }
1651 
1652     };
1653 
1654 
1655     /**
1656      * Add the boundary preceding the current position.
1657      * The current position can be left as it was, or changed to the newly added boundary,
1658      * as specified by the update parameter.
1659      */
addPreceding(int position, int ruleStatusIdx, boolean update)1660     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1661         assert(position < fBoundaries[fStartBufIdx]);
1662         assert(ruleStatusIdx <= Short.MAX_VALUE);
1663         int nextIdx = modChunkSize(fStartBufIdx - 1);
1664         if (nextIdx == fEndBufIdx) {
1665             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1666                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1667                 // current iteration position. And we also want to retain the current iteration position.
1668                 // (The buffer is already completely full of entries that precede the iteration position.)
1669                 return false;
1670             }
1671             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1672         }
1673         fBoundaries[nextIdx] = position;
1674         fStatuses[nextIdx] = (short)ruleStatusIdx;
1675         fStartBufIdx = nextIdx;
1676         if (update == UpdateCachePosition) {
1677             fBufIdx = nextIdx;
1678             fTextIdx = position;
1679         }
1680         return true;
1681     };
1682 
1683     /**
1684      *  Set the cache position to the specified position, or, if the position
1685      *  falls between to cached boundaries, to the preceding boundary.
1686      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1687      *  The startPosition must be on a code point boundary.
1688      *
1689      *  Return true if successful, false if the specified position is after
1690      *  the last cached boundary or before the first.
1691      */
1692     boolean seek(int pos) {
1693         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1694             return false;
1695         }
1696         if (pos == fBoundaries[fStartBufIdx]) {
1697             // Common case: seek(0), from BreakIterator::first()
1698             fBufIdx = fStartBufIdx;
1699             fTextIdx = fBoundaries[fBufIdx];
1700             return true;
1701         }
1702         if (pos == fBoundaries[fEndBufIdx]) {
1703             fBufIdx = fEndBufIdx;
1704             fTextIdx = fBoundaries[fBufIdx];
1705             return true;
1706         }
1707 
1708         int min = fStartBufIdx;
1709         int max = fEndBufIdx;
1710         while (min != max) {
1711             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1712             probe = modChunkSize(probe);
1713             if (fBoundaries[probe] > pos) {
1714                 max = probe;
1715             } else {
1716                 min = modChunkSize(probe + 1);
1717             }
1718         }
1719         assert(fBoundaries[max] > pos);
1720         fBufIdx = modChunkSize(max - 1);
1721         fTextIdx = fBoundaries[fBufIdx];
1722         assert(fTextIdx <= pos);
1723         return true;
1724 
1725     };
1726 
1727 
1728     /**
1729      * copy constructor, used from RuleBasedBreakIterator.clone().
1730      *
1731      * @param src
1732      */
BreakCache(BreakCache src)1733     BreakCache(BreakCache src)  {
1734         fStartBufIdx = src.fStartBufIdx;
1735         fEndBufIdx = src.fEndBufIdx;
1736         fTextIdx = src.fTextIdx;
1737         fBufIdx = src.fBufIdx;
1738         fBoundaries = src.fBoundaries.clone();
1739         fStatuses = src.fStatuses.clone();
1740         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1741     }
1742 
dumpCache()1743     void dumpCache() {
1744         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1745         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1746             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1747             if (i == fEndBufIdx) {
1748                 break;
1749             }
1750         }
1751     };
1752 
modChunkSize(int index)1753     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1754 
1755     static final int CACHE_SIZE = 128;
1756     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1757 
1758     int                 fStartBufIdx;
1759     int                 fEndBufIdx;    // inclusive
1760 
1761     int                 fTextIdx;
1762     int                 fBufIdx;
1763 
1764     int[]               fBoundaries = new int[CACHE_SIZE];
1765     short[]             fStatuses = new short[CACHE_SIZE];
1766 
1767     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1768 };
1769 
1770 
1771 
1772 
1773 }
1774 
1775