• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2005-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import static com.ibm.icu.impl.CharacterIteration.DONE32;
13 import static com.ibm.icu.impl.CharacterIteration.next32;
14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
15 
16 import java.io.ByteArrayOutputStream;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.OutputStream;
20 import java.nio.ByteBuffer;
21 import java.text.CharacterIterator;
22 import java.util.ArrayList;
23 import java.util.List;
24 
25 import com.ibm.icu.impl.CharacterIteration;
26 import com.ibm.icu.impl.ICUBinary;
27 import com.ibm.icu.impl.ICUDebug;
28 import com.ibm.icu.impl.RBBIDataWrapper;
29 import com.ibm.icu.impl.Trie2;
30 import com.ibm.icu.lang.UCharacter;
31 import com.ibm.icu.lang.UProperty;
32 import com.ibm.icu.lang.UScript;
33 
34 /**
35  * Rule Based Break Iterator
36  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
37  *
38  * @stable ICU 2.0
39  */
40 public class RuleBasedBreakIterator extends BreakIterator {
41     //=======================================================================
42     // Constructors & Factories
43     //=======================================================================
44 
45     /**
46      * private constructor
47      */
RuleBasedBreakIterator()48     private RuleBasedBreakIterator() {
49         fDictionaryCharCount  = 0;
50         synchronized(gAllBreakEngines) {
51             fBreakEngines = new ArrayList<>(gAllBreakEngines);
52         }
53     }
54 
55     /**
56      * Create a break iterator from a precompiled set of break rules.
57      *
58      * Creating a break iterator from the binary rules is much faster than
59      * creating one from source rules.
60      *
61      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
62      * Binary break iterator rules are not guaranteed to be compatible between
63      * different versions of ICU.
64      *
65      * @param is an input stream supplying the compiled binary rules.
66      * @throws IOException if there is an error while reading the rules from the InputStream.
67      * @see    #compileRules(String, OutputStream)
68      * @stable ICU 4.8
69      */
getInstanceFromCompiledRules(InputStream is)70     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
71         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
72         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
73         return This;
74     }
75 
76     /**
77      * Create a break iterator from a precompiled set of break rules.
78      *
79      * Creating a break iterator from the binary rules is much faster than
80      * creating one from source rules.
81      *
82      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
83      * Binary break iterator rules are not guaranteed to be compatible between
84      * different versions of ICU.
85      *
86      * @param bytes a buffer supplying the compiled binary rules.
87      * @throws IOException if there is an error while reading the rules from the buffer.
88      * @see    #compileRules(String, OutputStream)
89      * @internal
90      * @deprecated This API is ICU internal only.
91      */
92     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)93     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
94         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
95         This.fRData = RBBIDataWrapper.get(bytes);
96         return This;
97     }
98 
99     /**
100      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
101      * @param rules The break rules to be used.
102      * @stable ICU 2.2
103      */
RuleBasedBreakIterator(String rules)104     public RuleBasedBreakIterator(String rules)  {
105         this();
106         try {
107             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
108             compileRules(rules, ruleOS);
109             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
110         } catch (IOException e) {
111             ///CLOVER:OFF
112             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
113             //  causing bogus compiled rules to be produced, but with no compile error raised.
114             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
115                     + e.getMessage());
116             throw rte;
117             ///CLOVER:ON
118         }
119     }
120 
121     //=======================================================================
122     // Boilerplate
123     //=======================================================================
124 
125     /**
126      * Clones this iterator.
127      * @return A newly-constructed RuleBasedBreakIterator with the same
128      * behavior as this one.
129      * @stable ICU 2.0
130      */
131     @Override
clone()132     public Object clone()  {
133         RuleBasedBreakIterator result;
134         result = (RuleBasedBreakIterator)super.clone();
135         if (fText != null) {
136             result.fText = (CharacterIterator)(fText.clone());
137         }
138         synchronized (gAllBreakEngines)  {
139             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
140         }
141         result.fLookAheadMatches = new LookAheadResults();
142         result.fBreakCache = result.new BreakCache(fBreakCache);
143         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
144         return result;
145     }
146 
147 
148     /**
149      * Returns true if both BreakIterators are of the same class, have the same
150      * rules, and iterate over the same text.
151      * @stable ICU 2.0
152      */
153     @Override
equals(Object that)154     public boolean equals(Object that) {
155         if (that == null) {
156             return false;
157         }
158         if (this == that) {
159             return true;
160         }
161         try {
162             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
163             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
164                 return false;
165             }
166             if (fRData != null && other.fRData != null &&
167                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
168                 return false;
169             }
170             if (fText == null && other.fText == null) {
171                 return true;
172             }
173             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
174                 return false;
175             }
176             return fPosition == other.fPosition;
177         }
178         catch(ClassCastException e) {
179             return false;
180         }
181      }
182 
183     /**
184      * Returns the description (rules) used to create this iterator.
185      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
186      * @stable ICU 2.0
187      */
188     @Override
toString()189     public String toString() {
190         String retStr = "";
191         if (fRData != null) {
192             retStr =  fRData.fRuleSource;
193         }
194         return retStr;
195     }
196 
197     /**
198      * Compute a hashcode for this BreakIterator
199      * @return A hash code
200      * @stable ICU 2.0
201      */
202     @Override
hashCode()203     public int hashCode()
204     {
205         return fRData.fRuleSource.hashCode();
206     }
207 
208 
209     private static final int  START_STATE = 1;     // The state number of the starting state
210     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
211 
212     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
213     //               of user text.  A variable with this enum type keeps track of where we
214     //               are.  The state machine only fetches user text input while in RUN mode.
215     private static final int  RBBI_START  = 0;
216     private static final int  RBBI_RUN    = 1;
217     private static final int  RBBI_END    = 2;
218 
219     /**
220      * The character iterator through which this BreakIterator accesses the text.
221      */
222     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
223 
224     /**
225      * The rule data for this BreakIterator instance.
226      * Not intended for public use. Declared public for testing purposes only.
227      * @internal
228      * @deprecated This API is ICU internal only.
229      */
230     @Deprecated
231     public RBBIDataWrapper    fRData;
232 
233     /**
234      *  The iteration state - current position, rule status for the current position,
235      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
236      *                        Current position is pinned to be 0 < position <= text.length.
237      *                        Current position is always set to a boundary.
238      *
239      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
240      *  Never has the value UBRK_DONE (-1).
241      */
242     private int                fPosition;
243 
244     /**
245      * Index of the Rule {tag} values for the most recent match.
246      */
247     private int                fRuleStatusIndex;
248 
249     /**
250      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
251      */
252     private boolean            fDone;
253 
254     /**
255      *   Cache of previously determined boundary positions.
256      */
257     private BreakCache         fBreakCache = new BreakCache();
258 
259 
260     /**
261      * Counter for the number of characters encountered with the "dictionary"
262      *   flag set.  Normal RBBI iterators don't use it, although the code
263      *   for updating it is live.  Dictionary Based break iterators (a subclass
264      *   of us) access this field directly.
265      * @internal
266      */
267     private int fDictionaryCharCount;
268 
269     private DictionaryCache     fDictionaryCache = new DictionaryCache();
270 
271     /**
272      * ICU debug argument name for RBBI
273      */
274     private static final String RBBI_DEBUG_ARG = "rbbi";
275 
276     /**
277      * Debugging flag.  Trace operation of state machine when true.
278      */
279     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
280             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
281 
282     /**
283      * The "default" break engine - just skips over ranges of dictionary words,
284      * producing no breaks. Should only be used if characters need to be handled
285      * by a dictionary but we have no dictionary implementation for them.
286      *
287      * Only one instance; shared by all break iterators.
288      */
289     private static final UnhandledBreakEngine gUnhandledBreakEngine;
290 
291     /**
292      * List of all known break engines, common for all break iterators.
293      * Lazily updated as break engines are needed, because instantiation of
294      * break engines is expensive.
295      *
296      * Because gAllBreakEngines can be referenced concurrently from different
297      * BreakIterator instances, all access is synchronized.
298      */
299     private static final List<LanguageBreakEngine> gAllBreakEngines;
300 
301     static {
302         gUnhandledBreakEngine = new UnhandledBreakEngine();
303         gAllBreakEngines = new ArrayList<>();
304         gAllBreakEngines.add(gUnhandledBreakEngine);
305     }
306 
307     /**
308      * List of all known break engines. Similar to gAllBreakEngines, but local to a
309      * break iterator, allowing it to be used without synchronization.
310      */
311     private List<LanguageBreakEngine> fBreakEngines;
312 
313     /**
314      * Dump the contents of the state table and character classes for this break iterator.
315      * For debugging only.
316      * @internal
317      * @deprecated This API is ICU internal only.
318      */
319     @Deprecated
dump(java.io.PrintStream out)320     public void dump(java.io.PrintStream out) {
321         if (out == null) {
322             out = System.out;
323         }
324         this.fRData.dump(out);
325     }
326 
327     /**
328      * Compile a set of source break rules into the binary state tables used
329      * by the break iterator engine.  Creating a break iterator from precompiled
330      * rules is much faster than creating one from source rules.
331      *
332      * Binary break rules are not guaranteed to be compatible between different
333      * versions of ICU.
334      *
335      *
336      * @param rules  The source form of the break rules
337      * @param ruleBinary  An output stream to receive the compiled rules.
338      * @throws IOException If there is an error writing the output.
339      * @see #getInstanceFromCompiledRules(InputStream)
340      * @stable ICU 4.8
341      */
compileRules(String rules, OutputStream ruleBinary)342     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
343         RBBIRuleBuilder.compileRules(rules, ruleBinary);
344     }
345 
346     //=======================================================================
347     // BreakIterator overrides
348     //=======================================================================
349 
350     /**
351      * Sets the current iteration position to the beginning of the text.
352      * (i.e., the CharacterIterator's starting offset).
353      * @return The offset of the beginning of the text.
354      * @stable ICU 2.0
355      */
356     @Override
first()357     public int first() {
358         if (fText == null) {
359             return BreakIterator.DONE;
360         }
361         fText.first();
362         int start =  fText.getIndex();
363         if (!fBreakCache.seek(start)) {
364             fBreakCache.populateNear(start);
365         }
366         fBreakCache.current();
367         assert(fPosition == start);
368         return fPosition;
369     }
370 
371     /**
372      * Sets the current iteration position to the end of the text.
373      * (i.e., the CharacterIterator's ending offset).
374      * @return The text's past-the-end offset.
375      * @stable ICU 2.0
376      */
377     @Override
last()378     public int last() {
379         if (fText == null) {
380             return BreakIterator.DONE;
381         }
382         int endPos = fText.getEndIndex();
383         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
384         assert(endShouldBeBoundary);
385         if (fPosition != endPos) {
386             assert(fPosition == endPos);
387         }
388         return endPos;
389     }
390 
391     /**
392      * Advances the iterator either forward or backward the specified number of steps.
393      * Negative values move backward, and positive values move forward.  This is
394      * equivalent to repeatedly calling next() or previous().
395      * @param n The number of steps to move.  The sign indicates the direction
396      * (negative is backwards, and positive is forwards).
397      * @return The character offset of the boundary position n boundaries away from
398      * the current one.
399      * @stable ICU 2.0
400      */
401     @Override
next(int n)402     public int next(int n) {
403         int result = 0;
404         if (n > 0) {
405             for (; n > 0 && result != DONE; --n) {
406                 result = next();
407             }
408         } else if (n < 0) {
409             for (; n < 0 && result != DONE; ++n) {
410                 result = previous();
411             }
412         } else {
413             result = current();
414         }
415         return result;
416     }
417 
418     /**
419      * Advances the iterator to the next boundary position.
420      * @return The position of the first boundary after this one.
421      * @stable ICU 2.0
422      */
423     @Override
next()424     public int next() {
425         fBreakCache.next();
426         return fDone ? DONE : fPosition;
427     }
428 
429     /**
430      * Moves the iterator backwards, to the boundary preceding the current one.
431      * @return The position of the boundary position immediately preceding the starting position.
432      * @stable ICU 2.0
433      */
434     @Override
previous()435     public int previous() {
436         fBreakCache.previous();
437         return fDone ? DONE : fPosition;
438     }
439 
440     /**
441      * Sets the iterator to refer to the first boundary position following
442      * the specified position.
443      * @param startPos The position from which to begin searching for a break position.
444      * @return The position of the first break after the current position.
445      * @stable ICU 2.0
446      */
447     @Override
following(int startPos)448     public int following(int startPos) {
449         // if the supplied position is before the beginning, return the
450         // text's starting offset
451         if (startPos < fText.getBeginIndex()) {
452             return first();
453         }
454 
455         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
456         // Or it may be beyond the end of the text.
457         startPos = CISetIndex32(fText, startPos);
458         fBreakCache.following(startPos);
459         return fDone ? DONE : fPosition;
460     }
461 
462 
463     /**
464      * Sets the iterator to refer to the last boundary position before the
465      * specified position.
466      * @param offset The position to begin searching for a break from.
467      * @return The position of the last boundary before the starting position.
468      * @stable ICU 2.0
469      */
470     @Override
preceding(int offset)471     public int preceding(int offset) {
472         if (fText == null || offset > fText.getEndIndex()) {
473             return last();
474         } else if (offset < fText.getBeginIndex()) {
475             return first();
476         }
477 
478         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
479         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
480         int adjustedOffset = offset;
481         fBreakCache.preceding(adjustedOffset);
482         return fDone ? DONE : fPosition;
483 
484     }
485 
486 
487     /**
488      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
489      * @stable ICU 2.0
490      */
checkOffset(int offset, CharacterIterator text)491     protected static final void checkOffset(int offset, CharacterIterator text) {
492         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
493             throw new IllegalArgumentException("offset out of bounds");
494         }
495     }
496 
497 
498     /**
499      * Returns true if the specified position is a boundary position.  As a side
500      * effect, leaves the iterator pointing to the first boundary position at
501      * or after "offset".
502      * @param offset the offset to check.
503      * @return True if "offset" is a boundary position.
504      * @stable ICU 2.0
505      */
506     @Override
isBoundary(int offset)507     public boolean isBoundary(int offset) {
508         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
509         //       to not be boundaries, and to not be errors.
510         checkOffset(offset, fText);
511 
512         // Adjust offset to be on a code point boundary and not beyond the end of the text.
513         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
514         // But we still need the side effect of leaving iteration at the following boundary.
515         int adjustedOffset = CISetIndex32(fText, offset);
516 
517         boolean result = false;
518         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
519             result = (fBreakCache.current() == offset);
520         }
521 
522         if (!result) {
523             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
524             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
525             next();
526         }
527         return result;
528 
529     }
530 
531     /**
532      * Returns the current iteration position.  Note that DONE is never
533      * returned from this function; if iteration has run to the end of a
534      * string, current() will return the length of the string while
535      * next() will return BreakIterator.DONE).
536      * @return The current iteration position.
537      * @stable ICU 2.0
538      */
539     @Override
current()540     public int current() {
541         return (fText != null) ? fPosition : BreakIterator.DONE;
542     }
543 
544 
545     /**
546      * Return the status tag from the break rule that determined the boundary at
547      * the current iteration position.  The values appear in the rule source
548      * within brackets, {123}, for example.  For rules that do not specify a
549      * status, a default value of 0 is returned.  If more than one rule applies,
550      * the numerically largest of the possible status values is returned.
551      * <p>
552      * Of the standard types of ICU break iterators, only the word and line break
553      * iterator provides status values.  The values are defined in
554      * class RuleBasedBreakIterator, and allow distinguishing between words
555      * that contain alphabetic letters, "words" that appear to be numbers,
556      * punctuation and spaces, words containing ideographic characters, and
557      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
558      * position from <code>next()</code>, <code>previous()</code>, or
559      * any other break iterator functions that returns a boundary position.
560      * <p>
561      * Note that <code>getRuleStatus()</code> returns the value corresponding to
562      * <code>current()</code> index even after <code>next()</code> has returned DONE.
563      * <p>
564 
565      * @return the status from the break rule that determined the boundary
566      * at the current iteration position.
567      *
568      * @stable ICU 60
569      */
570 
571     @Override
getRuleStatus()572     public int  getRuleStatus() {
573         //   Status records have this form:
574         //           Count N         <--  fLastRuleStatusIndex points here.
575         //           Status val 0
576         //           Status val 1
577         //              ...
578         //           Status val N-1  <--  the value we need to return
579         //   The status values are sorted in ascending order.
580         //   This function returns the last (largest) of the array of status values.
581         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
582         int  tagVal = fRData.fStatusTable[idx];
583         return tagVal;
584     }
585 
586     /**
587      * Get the status (tag) values from the break rule(s) that determined the boundary
588      * at the current iteration position.  The values appear in the rule source
589      * within brackets, {123}, for example.  The default status value for rules
590      * that do not explicitly provide one is zero.
591      * <p>
592      * The status values used by the standard ICU break rules are defined
593      * as public constants in class RuleBasedBreakIterator.
594      * <p>
595      * If the size  of the output array is insufficient to hold the data,
596      *  the output will be truncated to the available length.  No exception
597      *  will be thrown.
598      *
599      * @param fillInArray an array to be filled in with the status values.
600      * @return          The number of rule status values from the rules that determined
601      *                  the boundary at the current iteration position.
602      *                  In the event that the array is too small, the return value
603      *                  is the total number of status values that were available,
604      *                  not the reduced number that were actually returned.
605      * @stable ICU 60
606      */
607     @Override
getRuleStatusVec(int[] fillInArray)608     public int getRuleStatusVec(int[] fillInArray) {
609         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
610         if (fillInArray != null) {
611             int numToCopy = Math.min(numStatusVals, fillInArray.length);
612             for (int i=0; i<numToCopy; i++) {
613                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
614             }
615         }
616         return numStatusVals;
617     }
618 
619     /**
620      * Returns a CharacterIterator over the text being analyzed.
621      * <p>
622      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
623      * must not be modified in any way while the BreakIterator is still in use.
624      * Doing so will lead to undefined behavior of the BreakIterator.
625      * Clone the returned CharacterIterator first and work with that.
626      * <p>
627      * The returned CharacterIterator is a reference
628      * to the <b>actual iterator being used</b> by the BreakIterator.
629      * No guarantees are made about the current position
630      * of this iterator when it is returned; it may differ from the
631      * BreakIterators current position.  If you need to move that
632      * position to examine the text, clone this function's return value first.
633      * @return An iterator over the text being analyzed.
634      * @stable ICU 2.0
635      */
636     @Override
getText()637     public CharacterIterator getText() {
638         return fText;
639     }
640 
641     /**
642      * Set the iterator to analyze a new piece of text.  This function resets
643      * the current iteration position to the beginning of the text.
644      * (The old iterator is dropped.)
645      * <p>
646      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
647      * directly by the BreakIterator, and must not be altered in any
648      * way by code outside of the BreakIterator.
649      * Doing so will lead to undefined behavior of the BreakIterator.
650      *
651      * @param newText An iterator over the text to analyze.
652      * @stable ICU 2.0
653      */
654     @Override
setText(CharacterIterator newText)655     public void setText(CharacterIterator newText) {
656         if (newText != null) {
657             fBreakCache.reset(newText.getBeginIndex(), 0);
658         } else {
659             fBreakCache.reset();
660         }
661         fDictionaryCache.reset();
662         fText = newText;
663         this.first();
664     }
665 
666      /**
667      * Control debug, trace and dump options.
668      * @internal
669      * @deprecated This API is ICU internal only.
670      */
671     @Deprecated
672     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
673                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
674 
675 
getLanguageBreakEngine(int c)676     private LanguageBreakEngine getLanguageBreakEngine(int c) {
677 
678         // We have a dictionary character.
679         // Does an already instantiated break engine handle it?
680         for (LanguageBreakEngine candidate : fBreakEngines) {
681             if (candidate.handles(c)) {
682                 return candidate;
683             }
684         }
685 
686         synchronized (gAllBreakEngines) {
687             // This break iterator's list of break engines didn't handle the character.
688             // Check the global list, another break iterator may have instantiated the
689             // desired engine.
690             for (LanguageBreakEngine candidate : gAllBreakEngines) {
691                 if (candidate.handles(c)) {
692                     fBreakEngines.add(candidate);
693                     return candidate;
694                 }
695             }
696 
697             // The global list doesn't have an existing engine, build one.
698             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
699             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
700                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
701                 // Fold them together for mapping from script -> engine.
702                 script = UScript.HAN;
703             }
704 
705             LanguageBreakEngine eng;
706             try {
707                 switch (script) {
708                 case UScript.THAI:
709                     eng = new ThaiBreakEngine();
710                     break;
711                 case UScript.LAO:
712                     eng = new LaoBreakEngine();
713                     break;
714                 case UScript.MYANMAR:
715                     eng = new BurmeseBreakEngine();
716                     break;
717                 case UScript.KHMER:
718                     eng = new KhmerBreakEngine();
719                     break;
720                 case UScript.HAN:
721                     eng = new CjkBreakEngine(false);
722                      break;
723                 case UScript.HANGUL:
724                     eng = new CjkBreakEngine(true);
725                     break;
726                 default:
727                     gUnhandledBreakEngine.handleChar(c);
728                     eng = gUnhandledBreakEngine;
729                     break;
730                 }
731             } catch (IOException e) {
732                 eng = null;
733             }
734 
735             if (eng != null && eng != gUnhandledBreakEngine) {
736                 gAllBreakEngines.add(eng);
737                 fBreakEngines.add(eng);
738             }
739             return eng;
740         }   // end synchronized(gAllBreakEngines)
741     }
742 
743     private static final int kMaxLookaheads = 8;
744     private static class LookAheadResults {
745         int      fUsedSlotLimit;
746         int[]    fPositions;
747         int[]    fKeys;
748 
LookAheadResults()749         LookAheadResults() {
750             fUsedSlotLimit= 0;
751             fPositions = new int[kMaxLookaheads];
752             fKeys = new int[kMaxLookaheads];
753         }
754 
getPosition(int key)755         int getPosition(int key) {
756             for (int i=0; i<fUsedSlotLimit; ++i) {
757                 if (fKeys[i] == key) {
758                     return fPositions[i];
759                 }
760             }
761             assert(false);
762             return -1;
763         }
764 
setPosition(int key, int position)765         void setPosition(int key, int position) {
766             int i;
767             for (i=0; i<fUsedSlotLimit; ++i) {
768                 if (fKeys[i] == key) {
769                     fPositions[i] = position;
770                     return;
771                 }
772             }
773             if (i >= kMaxLookaheads) {
774                 assert(false);
775                 i = kMaxLookaheads - 1;
776             }
777             fKeys[i] = key;
778             fPositions[i] = position;
779             assert(fUsedSlotLimit == i);
780             fUsedSlotLimit = i + 1;
781         }
782 
reset()783         void reset() {
784             fUsedSlotLimit = 0;
785         }
786     };
787     private LookAheadResults fLookAheadMatches = new LookAheadResults();
788 
789 
790     /**
791      * The State Machine Engine for moving forward is here.
792      * This function is the heart of the RBBI run time engine.
793      *
794      * Input
795      *    fPosition, the position in the text to begin from.
796      * Output
797      *    fPosition:           the boundary following the starting position.
798      *    fDictionaryCharCount the number of dictionary characters encountered.
799      *                         If > 0, the segment will be further subdivided
800      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
801      *
802      * @return the new iterator position
803      *
804      * A note on supplementary characters and the position of underlying
805      * Java CharacterIterator:   Normally, a character iterator is positioned at
806      * the char most recently returned by next().  Within this function, when
807      * a supplementary char is being processed, the char iterator is left
808      * sitting on the trail surrogate, in the middle of the code point.
809      * This is different from everywhere else, where an iterator always
810      * points at the lead surrogate of a supplementary.
811      */
handleNext()812     private int handleNext() {
813         if (TRACE) {
814             System.out.println("Handle Next   pos      char  state category");
815         }
816 
817         // handleNext always sets the break tag value.
818         // Set the default for it.
819         fRuleStatusIndex  = 0;
820         fDictionaryCharCount = 0;
821 
822         // caches for quicker access
823         CharacterIterator text = fText;
824         Trie2 trie = fRData.fTrie;
825 
826         short[] stateTable  = fRData.fFTable.fTable;
827         int initialPosition = fPosition;
828         text.setIndex(initialPosition);
829         int result          = initialPosition;
830 
831         // Set up the starting char
832         int c = text.current();
833         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
834             c = nextTrail32(text, c);
835             if (c == DONE32) {
836                 fDone = true;
837                 return BreakIterator.DONE;
838             }
839         }
840 
841         // Set the initial state for the state machine
842         int state           = START_STATE;
843         int row             = fRData.getRowIndex(state);
844         short category      = 3;
845         int flagsState      = fRData.fFTable.fFlags;
846         int mode            = RBBI_RUN;
847         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
848             category = 2;
849             mode     = RBBI_START;
850             if (TRACE) {
851                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
852                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
853                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
854             }
855         }
856         fLookAheadMatches.reset();
857 
858         // loop until we reach the end of the text or transition to state 0
859         while (state != STOP_STATE) {
860             if (c == DONE32) {
861                 // Reached end of input string.
862                 if (mode == RBBI_END) {
863                     // We have already run the loop one last time with the
864                     // character set to the pseudo {eof} value. Now it is time
865                     // to unconditionally bail out.
866                     break;
867                 }
868                 // Run the loop one last time with the fake end-of-input character category
869                 mode = RBBI_END;
870                 category = 1;
871             }
872             else if (mode == RBBI_RUN) {
873                 // Get the char category.  An incoming category of 1 or 2 mens that
874                 //      we are preset for doing the beginning or end of input, and
875                 //      that we shouldn't get a category from an actual text input character.
876                 //
877 
878                 // look up the current character's character category, which tells us
879                 // which column in the state table to look at.
880                 //
881                 category = (short) trie.get(c);
882 
883                 // Check the dictionary bit in the character's category.
884                 //    Counter is only used by dictionary based iterators (subclasses).
885                 //    Chars that need to be handled by a dictionary have a flag bit set
886                 //    in their category values.
887                 //
888                 if ((category & 0x4000) != 0)  {
889                     fDictionaryCharCount++;
890                     //  And off the dictionary flag bit.
891                     category &= ~0x4000;
892                 }
893 
894                 if (TRACE) {
895                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
896                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
897                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
898                 }
899 
900                 // Advance to the next character.
901                 // If this is a beginning-of-input loop iteration, don't advance.
902                 //    The next iteration will be processing the first real input character.
903                 c = text.next();
904                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
905                     c = nextTrail32(text, c);
906                 }
907             }
908             else {
909                 mode = RBBI_RUN;
910             }
911 
912             // look up a state transition in the state table
913             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
914             row   = fRData.getRowIndex(state);
915 
916             if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
917                 // Match found, common case
918                 result = text.getIndex();
919                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
920                     // The iterator has been left in the middle of a surrogate pair.
921                     // We want the start of it.
922                     result--;
923                 }
924 
925                 //  Remember the break status (tag) values.
926                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
927             }
928 
929             int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
930             if (completedRule > 0) {
931                 // Lookahead match is completed
932                 int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
933                 if (lookaheadResult >= 0) {
934                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX];
935                     fPosition = lookaheadResult;
936                     return lookaheadResult;
937                 }
938             }
939 
940             int rule =  stateTable[row + RBBIDataWrapper.LOOKAHEAD];
941             if (rule != 0) {
942                 // At the position of a '/' in a look-ahead match. Record it.
943                 int  pos = text.getIndex();
944                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
945                     // The iterator has been left in the middle of a surrogate pair.
946                     // We want the beginning  of it.
947                     pos--;
948                 }
949                 fLookAheadMatches.setPosition(rule, pos);
950             }
951 
952 
953         }        // End of state machine main loop
954 
955         // The state machine is done.  Check whether it found a match...
956 
957         // If the iterator failed to advance in the match engine force it ahead by one.
958         // This indicates a defect in the break rules, which should always match
959         // at least one character.
960 
961         if (result == initialPosition) {
962             if (TRACE) {
963                 System.out.println("Iterator did not move. Advancing by 1.");
964             }
965             text.setIndex(initialPosition);
966             next32(text);
967             result = text.getIndex();
968             fRuleStatusIndex = 0;
969         }
970 
971         // Leave the iterator at our result position.
972         //   (we may have advanced beyond the last accepting position chasing after
973         //    longer matches that never completed.)
974         fPosition = result;
975 
976         if (TRACE) {
977             System.out.println("result = " + result);
978         }
979         return result;
980     }
981 
982     /**
983      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
984      * This locates a "Safe Position" from which the forward break rules
985      * will operate correctly. A Safe Position is not necessarily a boundary itself.
986      *
987      * The logic of this function is very similar to handleNext(), above, but simpler
988      * because the safe table does not require as many options.
989      *
990      * @param fromPosition the position in the input text to begin the iteration.
991      * @internal
992      */
handleSafePrevious(int fromPosition)993     private int handleSafePrevious(int fromPosition) {
994         int             state;
995         short           category = 0;
996         int             result = 0;
997 
998         // caches for quicker access
999         CharacterIterator text = fText;
1000         Trie2 trie = fRData.fTrie;
1001         short[] stateTable  = fRData.fRTable.fTable;
1002 
1003         CISetIndex32(text, fromPosition);
1004         if (TRACE) {
1005             System.out.print("Handle Previous   pos   char  state category");
1006         }
1007 
1008         // if we're already at the start of the text, return DONE.
1009         if (text.getIndex() == text.getBeginIndex()) {
1010             return BreakIterator.DONE;
1011         }
1012 
1013         //  Set the initial state for the state machine
1014         int c = CharacterIteration.previous32(text);
1015         state = START_STATE;
1016         int row = fRData.getRowIndex(state);
1017 
1018         // loop until we reach the start of the text or transition to state 0
1019         //
1020         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
1021 
1022             // look up the current character's character category, which tells us
1023             // which column in the state table to look at.
1024             //
1025             //  And off the dictionary flag bit. For reverse iteration it is not used.
1026             category = (short) trie.get(c);
1027             category &= ~0x4000;
1028             if (TRACE) {
1029                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
1030                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
1031                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
1032             }
1033 
1034             // State Transition - move machine to its next state
1035             //
1036             assert(category < fRData.fHeader.fCatCount);
1037             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
1038             row   = fRData.getRowIndex(state);
1039 
1040             if (state == STOP_STATE) {
1041                 // This is the normal exit from the lookup state machine.
1042                 // Transition to state zero means we have found a safe point.
1043                 break;
1044             }
1045         }
1046 
1047         // The state machine is done.
1048         result = text.getIndex();
1049         if (TRACE) {
1050             System.out.println("result = " + result);
1051         }
1052         return result;
1053     }
1054 
1055     /**
1056      * Set the index of a CharacterIterator.
1057      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1058      * If the index points to a trail surrogate of a supplementary character, adjust it
1059      * to the start (lead surrogate) index.
1060      *
1061      * @param ci A CharacterIterator to set
1062      * @param index the index to set
1063      * @return the resulting index, possibly pinned or adjusted.
1064      */
1065     private static int CISetIndex32(CharacterIterator ci, int index) {
1066         if (index <= ci.getBeginIndex()) {
1067             ci.first();
1068         } else if (index >= ci.getEndIndex()) {
1069             ci.setIndex(ci.getEndIndex());
1070         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1071             if (!Character.isHighSurrogate(ci.previous())) {
1072                 ci.next();
1073             }
1074         }
1075         return ci.getIndex();
1076     }
1077 
1078     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1079      *                 Dictionary boundaries are moved first to this cache, then from here
1080      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1081      *                 boundaries. The public BreakIterator API always fetches directly
1082      *                 from the main BreakCache, not from here.
1083      *
1084      *                 In common situations, the number of boundaries in a single dictionary run
1085      *                 should be quite small, it will be terminated by punctuation, spaces,
1086      *                 or any other non-dictionary characters. The main BreakCache may end
1087      *                 up with boundaries from multiple dictionary based runs.
1088      *
1089      *                 The boundaries are stored in a simple ArrayList (vector), with the
1090      *                 assumption that they will be accessed sequentially.
1091      */
1092     class DictionaryCache  {
1093 
1094          void reset() {
1095              fPositionInCache = -1;
1096              fStart = 0;
1097              fLimit = 0;
1098              fFirstRuleStatusIndex = 0;
1099              fOtherRuleStatusIndex = 0;
1100              fBreaks.removeAllElements();
1101          };
1102 
1103          boolean following(int fromPos) {
1104              if (fromPos >= fLimit || fromPos < fStart) {
1105                  fPositionInCache = -1;
1106                  return false;
1107              }
1108 
1109              // Sequential iteration, move from previous boundary to the following
1110 
1111              int r = 0;
1112              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1113                  ++fPositionInCache;
1114                  if (fPositionInCache >= fBreaks.size()) {
1115                      fPositionInCache = -1;
1116                      return false;
1117                  }
1118                  r = fBreaks.elementAt(fPositionInCache);
1119                  assert(r > fromPos);
1120                  fBoundary = r;
1121                  fStatusIndex = fOtherRuleStatusIndex;
1122                  return true;
1123              }
1124 
1125              // Random indexing. Linear search for the boundary following the given position.
1126 
1127              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1128                  r= fBreaks.elementAt(fPositionInCache);
1129                  if (r > fromPos) {
1130                      fBoundary = r;
1131                      fStatusIndex = fOtherRuleStatusIndex;
1132                      return true;
1133                  }
1134              }
1135 
1136              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1137              assert(false);
1138              fPositionInCache = -1;
1139              return false;
1140          };
1141 
preceding(int fromPos)1142          boolean preceding(int fromPos) {
1143              if (fromPos <= fStart || fromPos > fLimit) {
1144                  fPositionInCache = -1;
1145                  return false;
1146              }
1147 
1148              if (fromPos == fLimit) {
1149                  fPositionInCache = fBreaks.size() - 1;
1150                  if (fPositionInCache >= 0) {
1151                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1152                  }
1153              }
1154 
1155              int r;
1156              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1157                  --fPositionInCache;
1158                  r = fBreaks.elementAt(fPositionInCache);
1159                  assert(r < fromPos);
1160                  fBoundary = r;
1161                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1162                  return true;
1163              }
1164 
1165              if (fPositionInCache == 0) {
1166                  fPositionInCache = -1;
1167                  return false;
1168              }
1169 
1170              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1171                  r = fBreaks.elementAt(fPositionInCache);
1172                  if (r < fromPos) {
1173                      fBoundary = r;
1174                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1175                      return true;
1176                  }
1177              }
1178              assert(false);
1179              fPositionInCache = -1;
1180              return false;
1181          };
1182 
1183         /**
1184          * Populate the cache with the dictionary based boundaries within a region of text.
1185          * @param startPos  The start position of a range of text
1186          * @param endPos    The end position of a range of text
1187          * @param firstRuleStatus The rule status index that applies to the break at startPos
1188          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1189          * @internal
1190          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1191         void populateDictionary(int startPos, int endPos,
1192                                 int firstRuleStatus, int otherRuleStatus) {
1193             if ((endPos - startPos) <= 1) {
1194                 return;
1195             }
1196 
1197             reset();
1198             fFirstRuleStatusIndex = firstRuleStatus;
1199             fOtherRuleStatusIndex = otherRuleStatus;
1200 
1201             int rangeStart = startPos;
1202             int rangeEnd = endPos;
1203 
1204             int         category;
1205             int         current;
1206             int         foundBreakCount = 0;
1207 
1208             // Loop through the text, looking for ranges of dictionary characters.
1209             // For each span, find the appropriate break engine, and ask it to find
1210             // any breaks within the span.
1211 
1212             fText.setIndex(rangeStart);
1213             int     c = CharacterIteration.current32(fText);
1214             category = (short)fRData.fTrie.get(c);
1215 
1216             while(true) {
1217                 while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
1218                     c = CharacterIteration.next32(fText);    // pre-increment
1219                     category = (short)fRData.fTrie.get(c);
1220                 }
1221                 if (current >= rangeEnd) {
1222                     break;
1223                 }
1224 
1225                 // We now have a dictionary character. Get the appropriate language object
1226                 // to deal with it.
1227                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1228 
1229                 // Ask the language object if there are any breaks. It will add them to the cache and
1230                 // leave the text pointer on the other side of its range, ready to search for the next one.
1231                 if (lbe != null) {
1232                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
1233                 }
1234 
1235                 // Reload the loop variables for the next go-round
1236                 c = CharacterIteration.current32(fText);
1237                 category = (short)fRData.fTrie.get(c);
1238             }
1239 
1240             // If we found breaks, ensure that the first and last entries are
1241             // the original starting and ending position. And initialize the
1242             // cache iteration position to the first entry.
1243 
1244             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1245             if (foundBreakCount > 0) {
1246                 assert(foundBreakCount == fBreaks.size());
1247                 if (startPos < fBreaks.elementAt(0)) {
1248                     // The dictionary did not place a boundary at the start of the segment of text.
1249                     // Add one now. This should not commonly happen, but it would be easy for interactions
1250                     // of the rules for dictionary segments and the break engine implementations to
1251                     // inadvertently cause it. Cover it here, just in case.
1252                     fBreaks.offer(startPos);
1253                 }
1254                 if (endPos > fBreaks.peek()) {
1255                     fBreaks.push(endPos);
1256                 }
1257                 fPositionInCache = 0;
1258                 // Note: Dictionary matching may extend beyond the original limit.
1259                 fStart = fBreaks.elementAt(0);
1260                 fLimit = fBreaks.peek();
1261             } else {
1262                 // there were no language-based breaks, even though the segment contained
1263                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1264                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1265             }
1266 
1267         };
1268 
1269 
DictionaryCache()1270         DictionaryCache() {
1271             fPositionInCache = -1;
1272             fBreaks = new DictionaryBreakEngine.DequeI();
1273         }
1274 
1275         /**
1276          * copy constructor. Used by RuleBasedBreakIterator.clone().
1277          *
1278          * @param src the source object to be copied.
1279          */
DictionaryCache(DictionaryCache src)1280         DictionaryCache(DictionaryCache src)  {
1281             try {
1282                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1283             }
1284             catch (CloneNotSupportedException e) {
1285                 throw new RuntimeException(e);
1286             }
1287             fPositionInCache      = src.fPositionInCache;
1288             fStart                = src.fStart;
1289             fLimit                = src.fLimit;
1290             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1291             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1292             fBoundary             = src.fBoundary;
1293             fStatusIndex          = src.fStatusIndex;
1294         }
1295 
1296         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1297         DictionaryBreakEngine.DequeI fBreaks;
1298         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1299         //                                      //    or preceding(). Optimizes sequential access.
1300         int             fStart;                 // Text position of first boundary in cache.
1301         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1302         //                                      //    text segment being handled by the dictionary.
1303         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1304         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1305         int             fBoundary;              // Current boundary. Set by preceding(), following().
1306         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1307     };
1308 
1309 
1310 
1311 
1312 /*
1313  * class BreakCache
1314  *
1315  * Cache of break boundary positions and rule status values.
1316  * Break iterator API functions, next(), previous(), etc., will use cached results
1317  * when possible, and otherwise cache new results as they are obtained.
1318  *
1319  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1320  *
1321  * The cache is implemented as a single circular buffer.
1322  */
1323 
1324 /*
1325  * size of the circular cache buffer.
1326  */
1327 
1328 class BreakCache {
1329 
BreakCache()1330     BreakCache() {
1331         reset();
1332     };
1333 
reset(int pos, int ruleStatus)1334     void  reset(int pos, int ruleStatus) {
1335         fStartBufIdx = 0;
1336         fEndBufIdx = 0;
1337         fTextIdx = pos;
1338         fBufIdx = 0;
1339         fBoundaries[0] = pos;
1340         fStatuses[0] = (short)ruleStatus;
1341     }
1342 
reset()1343     void  reset() {reset(0, 0); };
1344 
next()1345     void  next() {
1346         if (fBufIdx == fEndBufIdx) {
1347             fDone = !populateFollowing();
1348             fPosition = fTextIdx;
1349             fRuleStatusIndex = fStatuses[fBufIdx];
1350         } else {
1351             fBufIdx = modChunkSize(fBufIdx + 1);
1352             fTextIdx = fPosition = fBoundaries[fBufIdx];
1353             fRuleStatusIndex = fStatuses[fBufIdx];
1354         }
1355     };
1356 
previous()1357     void  previous() {
1358         int initialBufIdx = fBufIdx;
1359         if (fBufIdx == fStartBufIdx) {
1360             // At start of cache. Prepend to it.
1361             populatePreceding();
1362         } else {
1363             // Cache already holds the next boundary
1364             fBufIdx = modChunkSize(fBufIdx - 1);
1365             fTextIdx = fBoundaries[fBufIdx];
1366         }
1367         fDone = (fBufIdx == initialBufIdx);
1368         fPosition = fTextIdx;
1369         fRuleStatusIndex = fStatuses[fBufIdx];
1370         return;
1371     };
1372 
1373     // Move the iteration state to the position following the startPosition.
1374     // Input position must be pinned to the input length.
following(int startPos)1375     void following(int startPos) {
1376         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1377             // startPos is in the cache. Do a next() from that position.
1378             // TODO: an awkward set of interactions with bi->fDone
1379             //       seek() does not clear it; it can't because of interactions with populateNear().
1380             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1381             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1382             fDone = false;
1383             next();
1384         }
1385 
1386     };
1387 
preceding(int startPos)1388     void  preceding(int startPos) {
1389         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1390             if (startPos == fTextIdx) {
1391                 previous();
1392             } else {
1393                 // seek() leaves the BreakCache positioned at the preceding boundary
1394                 //        if the requested position is between two bounaries.
1395                 // current() pushes the BreakCache position out to the BreakIterator itself.
1396                 assert(startPos > fTextIdx);
1397                 current();
1398             }
1399         }
1400         return;
1401     };
1402 
1403     /**
1404      * Update the state of the public BreakIterator (fBI) to reflect the
1405      * current state of the break iterator cache (this).
1406      */
current()1407     int current() {
1408         fPosition = fTextIdx;
1409         fRuleStatusIndex = fStatuses[fBufIdx];
1410         fDone = false;
1411         return fTextIdx;
1412     };
1413 
1414     /**
1415      * Add boundaries to the cache near the specified position.
1416      * The given position need not be a boundary itself.
1417      * The input position must be within the range of the text, and
1418      * on a code point boundary.
1419      * If the requested position is a break boundary, leave the iteration
1420      * position on it.
1421      * If the requested position is not a boundary, leave the iteration
1422      * position on the preceding boundary and include both the the
1423      * preceding and following boundaries in the cache.
1424      * Additional boundaries, either preceding or following, may be added
1425      * to the cache as a side effect.
1426      *
1427      * Return false if the operation failed.
1428      */
populateNear(int position)1429     boolean populateNear(int position) {
1430         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1431 
1432         // Find a boundary somewhere in the vicinity of the requested position.
1433         // Depending on the safe rules and the text data, it could be either before, at, or after
1434         // the requested position.
1435 
1436 
1437         // If the requested position is not near already cached positions, clear the existing cache,
1438         // find a near-by boundary and begin new cache contents there.
1439 
1440         if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1441             int aBoundary = fText.getBeginIndex();
1442             int ruleStatusIndex = 0;
1443             if (position > aBoundary + 20) {
1444                 int backupPos = handleSafePrevious(position);
1445                 if (backupPos > aBoundary) {
1446                     // Advance to the boundary following the backup position.
1447                     // There is a complication: the safe reverse rules identify pairs of code points
1448                     // that are safe. If advancing from the safe point moves forwards by less than
1449                     // two code points, we need to advance one more time to ensure that the boundary
1450                     // is good, including a correct rules status value.
1451                     //
1452                     fPosition = backupPos;
1453                     aBoundary = handleNext();
1454                     if (aBoundary == backupPos + 1 ||
1455                             (aBoundary == backupPos + 2 &&
1456                             Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1457                             Character.isLowSurrogate(fText.next()))) {
1458                         // The initial handleNext() only advanced by a single code point. Go again.
1459                         // Safe rules identify safe pairs.
1460                         aBoundary = handleNext();
1461                     }
1462                 }
1463                 ruleStatusIndex = fRuleStatusIndex;
1464             }
1465             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1466         }
1467 
1468         // Fill in boundaries between existing cache content and the new requested position.
1469 
1470         if (fBoundaries[fEndBufIdx] < position) {
1471             // The last position in the cache precedes the requested position.
1472             // Add following position(s) to the cache.
1473             while (fBoundaries[fEndBufIdx] < position) {
1474                 if (!populateFollowing()) {
1475                     assert false;
1476                     return false;
1477                 }
1478             }
1479             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1480             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1481             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1482                 previous();
1483             }
1484             return true;
1485         }
1486 
1487         if (fBoundaries[fStartBufIdx] > position) {
1488             // The first position in the cache is beyond the requested position.
1489             // back up more until we get a boundary <= the requested position.
1490             while (fBoundaries[fStartBufIdx] > position) {
1491                 populatePreceding();
1492             }
1493             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1494             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1495             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1496                 next();
1497             }
1498             if (fTextIdx > position) {
1499                 // If position is not itself a boundary, the next() loop above will overshoot.
1500                 // Back up one, leaving cache position at the boundary preceding the requested position.
1501                 previous();
1502             }
1503             return true;
1504         }
1505 
1506         assert fTextIdx == position;
1507         return true;
1508 
1509     };
1510 
1511     /**
1512      *  Add boundary(s) to the cache following the current last boundary.
1513      *  Return false if at the end of the text, and no more boundaries can be added.
1514      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1515      */
populateFollowing()1516     boolean populateFollowing() {
1517         int fromPosition = fBoundaries[fEndBufIdx];
1518         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1519         int pos = 0;
1520         int ruleStatusIdx = 0;
1521 
1522         if (fDictionaryCache.following(fromPosition)) {
1523             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1524             return true;
1525         }
1526 
1527         fPosition = fromPosition;
1528         pos = handleNext();
1529         if (pos == BreakIterator.DONE) {
1530             return false;
1531         }
1532 
1533         ruleStatusIdx = fRuleStatusIndex;
1534         if (fDictionaryCharCount > 0) {
1535             // The text segment obtained from the rules includes dictionary characters.
1536             // Subdivide it, with subdivided results going into the dictionary cache.
1537             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1538             if (fDictionaryCache.following(fromPosition)) {
1539                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1540                 return true;
1541                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1542                 //       But be careful with interactions with populateNear().
1543             }
1544         }
1545 
1546         // Rule based segment did not include dictionary characters.
1547         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1548         //    meaning that we didn't take the return, above.
1549         // Add its end point to the cache.
1550         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1551 
1552         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1553         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1554         //
1555         for (int count=0; count<6; ++count) {
1556             pos = handleNext();
1557             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1558                 break;
1559             }
1560             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1561         }
1562         return true;
1563     };
1564 
1565     /**
1566      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1567      *  Leave the iteration position on the first added boundary.
1568      *  Return false if no boundaries could be added (if at the start of the text.)
1569      */
populatePreceding()1570     boolean populatePreceding() {
1571         int textBegin = fText.getBeginIndex();
1572         int fromPosition = fBoundaries[fStartBufIdx];
1573         if (fromPosition == textBegin) {
1574             return false;
1575         }
1576 
1577         int position = textBegin;
1578         int positionStatusIdx = 0;
1579 
1580         if (fDictionaryCache.preceding(fromPosition)) {
1581             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1582             return true;
1583         }
1584 
1585         int backupPosition = fromPosition;
1586 
1587         // Find a boundary somewhere preceding the first already-cached boundary
1588         do {
1589             backupPosition = backupPosition - 30;
1590             if (backupPosition <= textBegin) {
1591                 backupPosition = textBegin;
1592             } else {
1593                 backupPosition = handleSafePrevious(backupPosition);
1594             }
1595             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1596                 position = textBegin;
1597                 positionStatusIdx = 0;
1598             } else {
1599                 // Advance to the boundary following the backup position.
1600                 // There is a complication: the safe reverse rules identify pairs of code points
1601                 // that are safe. If advancing from the safe point moves forwards by less than
1602                 // two code points, we need to advance one more time to ensure that the boundary
1603                 // is good, including a correct rules status value.
1604                 //
1605                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1606                 position = handleNext();
1607                 if (position == backupPosition + 1 ||
1608                         (position == backupPosition + 2 &&
1609                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1610                         Character.isLowSurrogate(fText.next()))) {
1611                     // The initial handleNext() only advanced by a single code point. Go again.
1612                     // Safe rules identify safe pairs.
1613                     position = handleNext();
1614                 }
1615                 positionStatusIdx = fRuleStatusIndex;
1616             }
1617         } while (position >= fromPosition);
1618 
1619         // Find boundaries between the one we just located and the first already-cached boundary
1620         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1621 
1622         fSideBuffer.removeAllElements();
1623         fSideBuffer.push(position);
1624         fSideBuffer.push(positionStatusIdx);
1625 
1626         do {
1627             int prevPosition = fPosition = position;
1628             int prevStatusIdx = positionStatusIdx;
1629             position = handleNext();
1630             positionStatusIdx = fRuleStatusIndex;
1631             if (position == BreakIterator.DONE) {
1632                 break;
1633             }
1634 
1635             boolean segmentHandledByDictionary = false;
1636             if (fDictionaryCharCount != 0) {
1637                 // Segment from the rules includes dictionary characters.
1638                 // Subdivide it, with subdivided results going into the dictionary cache.
1639                 int dictSegEndPosition = position;
1640                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1641                 while (fDictionaryCache.following(prevPosition)) {
1642                     position = fDictionaryCache.fBoundary;
1643                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1644                     segmentHandledByDictionary = true;
1645                     assert(position > prevPosition);
1646                     if (position >= fromPosition) {
1647                         break;
1648                     }
1649                     assert(position <= dictSegEndPosition);
1650                     fSideBuffer.push(position);
1651                     fSideBuffer.push(positionStatusIdx);
1652                     prevPosition = position;
1653                 }
1654                 assert(position==dictSegEndPosition || position>=fromPosition);
1655             }
1656 
1657             if (!segmentHandledByDictionary && position < fromPosition) {
1658                 fSideBuffer.push(position);
1659                 fSideBuffer.push(positionStatusIdx);
1660             }
1661         } while (position < fromPosition);
1662 
1663         // Move boundaries from the side buffer to the main circular buffer.
1664         boolean success = false;
1665         if (!fSideBuffer.isEmpty()) {
1666             positionStatusIdx = fSideBuffer.pop();
1667             position = fSideBuffer.pop();
1668             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1669             success = true;
1670         }
1671 
1672         while (!fSideBuffer.isEmpty()) {
1673             positionStatusIdx = fSideBuffer.pop();
1674             position = fSideBuffer.pop();
1675             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1676                 // No space in circular buffer to hold a new preceding result while
1677                 // also retaining the current cache (iteration) position.
1678                 // Bailing out is safe; the cache will refill again if needed.
1679                 break;
1680             }
1681         }
1682         return success;
1683     };
1684 
1685 
1686     static final boolean RetainCachePosition = false;
1687     static final boolean UpdateCachePosition = true;
1688 
1689     /**
1690      * Add the boundary following the current position.
1691      * The current position can be left as it was, or changed to the newly added boundary,
1692      * as specified by the update parameter.
1693      */
addFollowing(int position, int ruleStatusIdx, boolean update)1694     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1695         assert(position > fBoundaries[fEndBufIdx]);
1696         assert(ruleStatusIdx <= Short.MAX_VALUE);
1697         int nextIdx = modChunkSize(fEndBufIdx + 1);
1698         if (nextIdx == fStartBufIdx) {
1699             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1700         }
1701         fBoundaries[nextIdx] = position;
1702         fStatuses[nextIdx] = (short)ruleStatusIdx;
1703         fEndBufIdx = nextIdx;
1704         if (update == UpdateCachePosition) {
1705             // Set current position to the newly added boundary.
1706             fBufIdx = nextIdx;
1707             fTextIdx = position;
1708         } else {
1709             // Retaining the original cache position.
1710             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1711             // It's the responsibility of callers of this function to not add too many.
1712             assert(nextIdx != fBufIdx);
1713         }
1714 
1715     };
1716 
1717 
1718     /**
1719      * Add the boundary preceding the current position.
1720      * The current position can be left as it was, or changed to the newly added boundary,
1721      * as specified by the update parameter.
1722      */
addPreceding(int position, int ruleStatusIdx, boolean update)1723     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1724         assert(position < fBoundaries[fStartBufIdx]);
1725         assert(ruleStatusIdx <= Short.MAX_VALUE);
1726         int nextIdx = modChunkSize(fStartBufIdx - 1);
1727         if (nextIdx == fEndBufIdx) {
1728             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1729                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1730                 // current iteration position. And we also want to retain the current iteration position.
1731                 // (The buffer is already completely full of entries that precede the iteration position.)
1732                 return false;
1733             }
1734             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1735         }
1736         fBoundaries[nextIdx] = position;
1737         fStatuses[nextIdx] = (short)ruleStatusIdx;
1738         fStartBufIdx = nextIdx;
1739         if (update == UpdateCachePosition) {
1740             fBufIdx = nextIdx;
1741             fTextIdx = position;
1742         }
1743         return true;
1744     };
1745 
1746     /**
1747      *  Set the cache position to the specified position, or, if the position
1748      *  falls between to cached boundaries, to the preceding boundary.
1749      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1750      *  The startPosition must be on a code point boundary.
1751      *
1752      *  Return true if successful, false if the specified position is after
1753      *  the last cached boundary or before the first.
1754      */
1755     boolean seek(int pos) {
1756         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1757             return false;
1758         }
1759         if (pos == fBoundaries[fStartBufIdx]) {
1760             // Common case: seek(0), from BreakIterator::first()
1761             fBufIdx = fStartBufIdx;
1762             fTextIdx = fBoundaries[fBufIdx];
1763             return true;
1764         }
1765         if (pos == fBoundaries[fEndBufIdx]) {
1766             fBufIdx = fEndBufIdx;
1767             fTextIdx = fBoundaries[fBufIdx];
1768             return true;
1769         }
1770 
1771         int min = fStartBufIdx;
1772         int max = fEndBufIdx;
1773         while (min != max) {
1774             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1775             probe = modChunkSize(probe);
1776             if (fBoundaries[probe] > pos) {
1777                 max = probe;
1778             } else {
1779                 min = modChunkSize(probe + 1);
1780             }
1781         }
1782         assert(fBoundaries[max] > pos);
1783         fBufIdx = modChunkSize(max - 1);
1784         fTextIdx = fBoundaries[fBufIdx];
1785         assert(fTextIdx <= pos);
1786         return true;
1787 
1788     };
1789 
1790 
1791     /**
1792      * copy constructor, used from RuleBasedBreakIterator.clone().
1793      *
1794      * @param src
1795      */
BreakCache(BreakCache src)1796     BreakCache(BreakCache src)  {
1797         fStartBufIdx = src.fStartBufIdx;
1798         fEndBufIdx = src.fEndBufIdx;
1799         fTextIdx = src.fTextIdx;
1800         fBufIdx = src.fBufIdx;
1801         fBoundaries = src.fBoundaries.clone();
1802         fStatuses = src.fStatuses.clone();
1803         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1804     }
1805 
dumpCache()1806     void dumpCache() {
1807         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1808         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1809             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1810             if (i == fEndBufIdx) {
1811                 break;
1812             }
1813         }
1814     };
1815 
modChunkSize(int index)1816     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1817 
1818     static final int CACHE_SIZE = 128;
1819     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1820 
1821     int                 fStartBufIdx;
1822     int                 fEndBufIdx;    // inclusive
1823 
1824     int                 fTextIdx;
1825     int                 fBufIdx;
1826 
1827     int[]               fBoundaries = new int[CACHE_SIZE];
1828     short[]             fStatuses = new short[CACHE_SIZE];
1829 
1830     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1831 };
1832 
1833 
1834 
1835 
1836 }
1837 
1838