• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2005-2016 International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import static com.ibm.icu.impl.CharacterIteration.DONE32;
13 import static com.ibm.icu.impl.CharacterIteration.next32;
14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
15 
16 import java.io.ByteArrayOutputStream;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.OutputStream;
20 import java.nio.ByteBuffer;
21 import java.text.CharacterIterator;
22 import java.util.ArrayList;
23 import java.util.List;
24 
25 import com.ibm.icu.impl.CharacterIteration;
26 import com.ibm.icu.impl.ICUBinary;
27 import com.ibm.icu.impl.ICUDebug;
28 import com.ibm.icu.impl.RBBIDataWrapper;
29 import com.ibm.icu.lang.UCharacter;
30 import com.ibm.icu.lang.UProperty;
31 import com.ibm.icu.lang.UScript;
32 import com.ibm.icu.util.CodePointTrie;
33 
34 /**
35  * Rule Based Break Iterator
36  * This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
37  *
38  * @stable ICU 2.0
39  */
40 public class RuleBasedBreakIterator extends BreakIterator {
41     //=======================================================================
42     // Constructors & Factories
43     //=======================================================================
44 
45     /**
46      * private constructor
47      */
RuleBasedBreakIterator()48     private RuleBasedBreakIterator() {
49         fDictionaryCharCount  = 0;
50         synchronized(gAllBreakEngines) {
51             fBreakEngines = new ArrayList<>(gAllBreakEngines);
52         }
53     }
54 
55     /**
56      * Create a break iterator from a precompiled set of break rules.
57      *
58      * Creating a break iterator from the binary rules is much faster than
59      * creating one from source rules.
60      *
61      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
62      * Binary break iterator rules are not guaranteed to be compatible between
63      * different versions of ICU.
64      *
65      * @param is an input stream supplying the compiled binary rules.
66      * @throws IOException if there is an error while reading the rules from the InputStream.
67      * @see    #compileRules(String, OutputStream)
68      * @stable ICU 4.8
69      */
getInstanceFromCompiledRules(InputStream is)70     public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
71         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
72         This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
73         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
74         return This;
75     }
76 
77     /**
78      * Create a break iterator from a precompiled set of break rules.
79      *
80      * Creating a break iterator from the binary rules is much faster than
81      * creating one from source rules.
82      *
83      * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
84      * Binary break iterator rules are not guaranteed to be compatible between
85      * different versions of ICU.
86      *
87      * @param bytes a buffer supplying the compiled binary rules.
88      * @throws IOException if there is an error while reading the rules from the buffer.
89      * @see    #compileRules(String, OutputStream)
90      * @internal
91      * @deprecated This API is ICU internal only.
92      */
93     @Deprecated
getInstanceFromCompiledRules(ByteBuffer bytes)94     public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException {
95         RuleBasedBreakIterator  This = new RuleBasedBreakIterator();
96         This.fRData = RBBIDataWrapper.get(bytes);
97         This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
98         return This;
99     }
100 
101     /**
102      * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
103      * @param rules The break rules to be used.
104      * @stable ICU 2.2
105      */
RuleBasedBreakIterator(String rules)106     public RuleBasedBreakIterator(String rules)  {
107         this();
108         try {
109             ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
110             compileRules(rules, ruleOS);
111             fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
112             fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
113         } catch (IOException e) {
114             ///CLOVER:OFF
115             // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
116             //  causing bogus compiled rules to be produced, but with no compile error raised.
117             RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: "
118                     + e.getMessage());
119             throw rte;
120             ///CLOVER:ON
121         }
122     }
123 
124     //=======================================================================
125     // Boilerplate
126     //=======================================================================
127 
128     /**
129      * Clones this iterator.
130      * @return A newly-constructed RuleBasedBreakIterator with the same
131      * behavior as this one.
132      * @stable ICU 2.0
133      */
134     @Override
clone()135     public Object clone()  {
136         RuleBasedBreakIterator result;
137         result = (RuleBasedBreakIterator)super.clone();
138         if (fText != null) {
139             result.fText = (CharacterIterator)(fText.clone());
140         }
141         synchronized (gAllBreakEngines)  {
142             result.fBreakEngines = new ArrayList<>(gAllBreakEngines);
143         }
144         result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
145         result.fBreakCache = result.new BreakCache(fBreakCache);
146         result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
147         return result;
148     }
149 
150 
151     /**
152      * Returns true if both BreakIterators are of the same class, have the same
153      * rules, and iterate over the same text.
154      * @stable ICU 2.0
155      */
156     @Override
equals(Object that)157     public boolean equals(Object that) {
158         if (that == null) {
159             return false;
160         }
161         if (this == that) {
162             return true;
163         }
164         try {
165             RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
166             if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
167                 return false;
168             }
169             if (fRData != null && other.fRData != null &&
170                     (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
171                 return false;
172             }
173             if (fText == null && other.fText == null) {
174                 return true;
175             }
176             if (fText == null || other.fText == null || !fText.equals(other.fText)) {
177                 return false;
178             }
179             return fPosition == other.fPosition;
180         }
181         catch(ClassCastException e) {
182             return false;
183         }
184      }
185 
186     /**
187      * Returns the description (rules) used to create this iterator.
188      * (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
189      * @stable ICU 2.0
190      */
191     @Override
toString()192     public String toString() {
193         String retStr = "";
194         if (fRData != null) {
195             retStr =  fRData.fRuleSource;
196         }
197         return retStr;
198     }
199 
200     /**
201      * Compute a hashcode for this BreakIterator
202      * @return A hash code
203      * @stable ICU 2.0
204      */
205     @Override
hashCode()206     public int hashCode()
207     {
208         return fRData.fRuleSource.hashCode();
209     }
210 
211 
212     private static final int  START_STATE = 1;     // The state number of the starting state
213     private static final int  STOP_STATE  = 0;     // The state-transition value indicating "stop"
214 
215     // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
216     //               of user text.  A variable with this enum type keeps track of where we
217     //               are.  The state machine only fetches user text input while in RUN mode.
218     private static final int  RBBI_START  = 0;
219     private static final int  RBBI_RUN    = 1;
220     private static final int  RBBI_END    = 2;
221 
222     /**
223      * The character iterator through which this BreakIterator accesses the text.
224      */
225     private CharacterIterator   fText = new java.text.StringCharacterIterator("");
226 
227     /**
228      * The rule data for this BreakIterator instance.
229      * Not intended for public use. Declared public for testing purposes only.
230      * @internal
231      * @deprecated This API is ICU internal only.
232      */
233     @Deprecated
234     public RBBIDataWrapper    fRData;
235 
236     /**
237      *  The iteration state - current position, rule status for the current position,
238      *                        and whether the iterator ran off the end, yielding UBRK_DONE.
239      *                        Current position is pinned to be 0 < position <= text.length.
240      *                        Current position is always set to a boundary.
241      *
242      *  The current  position of the iterator. Pinned, 0 < fPosition <= text.length.
243      *  Never has the value UBRK_DONE (-1).
244      */
245     private int                fPosition;
246 
247     /**
248      * Index of the Rule {tag} values for the most recent match.
249      */
250     private int                fRuleStatusIndex;
251 
252     /**
253      * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
254      */
255     private boolean            fDone;
256 
257     /**
258      *  Array of look-ahead tentative results.
259      */
260     private int[]              fLookAheadMatches;
261 
262     /**
263      *   Cache of previously determined boundary positions.
264      */
265     private BreakCache         fBreakCache = new BreakCache();
266 
267 
268     /**
269      * Counter for the number of characters encountered with the "dictionary"
270      *   flag set.  Normal RBBI iterators don't use it, although the code
271      *   for updating it is live.  Dictionary Based break iterators (a subclass
272      *   of us) access this field directly.
273      * @internal
274      */
275     private int fDictionaryCharCount;
276 
277     private DictionaryCache     fDictionaryCache = new DictionaryCache();
278 
279     /**
280      * ICU debug argument name for RBBI
281      */
282     private static final String RBBI_DEBUG_ARG = "rbbi";
283 
284     /**
285      * Debugging flag.  Trace operation of state machine when true.
286      */
287     private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
288             && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
289 
290     /**
291      * The "default" break engine - just skips over ranges of dictionary words,
292      * producing no breaks. Should only be used if characters need to be handled
293      * by a dictionary but we have no dictionary implementation for them.
294      *
295      * Only one instance; shared by all break iterators.
296      */
297     private static final UnhandledBreakEngine gUnhandledBreakEngine;
298 
299     /**
300      * List of all known break engines, common for all break iterators.
301      * Lazily updated as break engines are needed, because instantiation of
302      * break engines is expensive.
303      *
304      * Because gAllBreakEngines can be referenced concurrently from different
305      * BreakIterator instances, all access is synchronized.
306      */
307     private static final List<LanguageBreakEngine> gAllBreakEngines;
308 
309     static {
310         gUnhandledBreakEngine = new UnhandledBreakEngine();
311         gAllBreakEngines = new ArrayList<>();
312         gAllBreakEngines.add(gUnhandledBreakEngine);
313     }
314 
315     /**
316      * List of all known break engines. Similar to gAllBreakEngines, but local to a
317      * break iterator, allowing it to be used without synchronization.
318      */
319     private List<LanguageBreakEngine> fBreakEngines;
320 
321     /**
322      * Dump the contents of the state table and character classes for this break iterator.
323      * For debugging only.
324      * @internal
325      * @deprecated This API is ICU internal only.
326      */
327     @Deprecated
dump(java.io.PrintStream out)328     public void dump(java.io.PrintStream out) {
329         if (out == null) {
330             out = System.out;
331         }
332         this.fRData.dump(out);
333     }
334 
335     /**
336      * Compile a set of source break rules into the binary state tables used
337      * by the break iterator engine.  Creating a break iterator from precompiled
338      * rules is much faster than creating one from source rules.
339      *
340      * Binary break rules are not guaranteed to be compatible between different
341      * versions of ICU.
342      *
343      *
344      * @param rules  The source form of the break rules
345      * @param ruleBinary  An output stream to receive the compiled rules.
346      * @throws IOException If there is an error writing the output.
347      * @see #getInstanceFromCompiledRules(InputStream)
348      * @stable ICU 4.8
349      */
compileRules(String rules, OutputStream ruleBinary)350     public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
351         RBBIRuleBuilder.compileRules(rules, ruleBinary);
352     }
353 
354     //=======================================================================
355     // BreakIterator overrides
356     //=======================================================================
357 
358     /**
359      * Sets the current iteration position to the beginning of the text.
360      * (i.e., the CharacterIterator's starting offset).
361      * @return The offset of the beginning of the text.
362      * @stable ICU 2.0
363      */
364     @Override
first()365     public int first() {
366         if (fText == null) {
367             return BreakIterator.DONE;
368         }
369         fText.first();
370         int start =  fText.getIndex();
371         if (!fBreakCache.seek(start)) {
372             fBreakCache.populateNear(start);
373         }
374         fBreakCache.current();
375         assert(fPosition == start);
376         return fPosition;
377     }
378 
379     /**
380      * Sets the current iteration position to the end of the text.
381      * (i.e., the CharacterIterator's ending offset).
382      * @return The text's past-the-end offset.
383      * @stable ICU 2.0
384      */
385     @Override
last()386     public int last() {
387         if (fText == null) {
388             return BreakIterator.DONE;
389         }
390         int endPos = fText.getEndIndex();
391         boolean endShouldBeBoundary = isBoundary(endPos);      // Has side effect of setting iterator position.
392         assert(endShouldBeBoundary);
393         if (fPosition != endPos) {
394             assert(fPosition == endPos);
395         }
396         return endPos;
397     }
398 
399     /**
400      * Advances the iterator either forward or backward the specified number of steps.
401      * Negative values move backward, and positive values move forward.  This is
402      * equivalent to repeatedly calling next() or previous().
403      * @param n The number of steps to move.  The sign indicates the direction
404      * (negative is backwards, and positive is forwards).
405      * @return The character offset of the boundary position n boundaries away from
406      * the current one.
407      * @stable ICU 2.0
408      */
409     @Override
next(int n)410     public int next(int n) {
411         int result = 0;
412         if (n > 0) {
413             for (; n > 0 && result != DONE; --n) {
414                 result = next();
415             }
416         } else if (n < 0) {
417             for (; n < 0 && result != DONE; ++n) {
418                 result = previous();
419             }
420         } else {
421             result = current();
422         }
423         return result;
424     }
425 
426     /**
427      * Advances the iterator to the next boundary position.
428      * @return The position of the first boundary after this one.
429      * @stable ICU 2.0
430      */
431     @Override
next()432     public int next() {
433         fBreakCache.next();
434         return fDone ? DONE : fPosition;
435     }
436 
437     /**
438      * Moves the iterator backwards, to the boundary preceding the current one.
439      * @return The position of the boundary position immediately preceding the starting position.
440      * @stable ICU 2.0
441      */
442     @Override
previous()443     public int previous() {
444         fBreakCache.previous();
445         return fDone ? DONE : fPosition;
446     }
447 
448     /**
449      * Sets the iterator to refer to the first boundary position following
450      * the specified position.
451      * @param startPos The position from which to begin searching for a break position.
452      * @return The position of the first break after the current position.
453      * @stable ICU 2.0
454      */
455     @Override
following(int startPos)456     public int following(int startPos) {
457         // if the supplied position is before the beginning, return the
458         // text's starting offset
459         if (startPos < fText.getBeginIndex()) {
460             return first();
461         }
462 
463         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
464         // Or it may be beyond the end of the text.
465         startPos = CISetIndex32(fText, startPos);
466         fBreakCache.following(startPos);
467         return fDone ? DONE : fPosition;
468     }
469 
470 
471     /**
472      * Sets the iterator to refer to the last boundary position before the
473      * specified position.
474      * @param offset The position to begin searching for a break from.
475      * @return The position of the last boundary before the starting position.
476      * @stable ICU 2.0
477      */
478     @Override
preceding(int offset)479     public int preceding(int offset) {
480         if (fText == null || offset > fText.getEndIndex()) {
481             return last();
482         } else if (offset < fText.getBeginIndex()) {
483             return first();
484         }
485 
486         // Move requested offset to a code point start. It might be between a lead and trail surrogate.
487         // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C behavior.
488         int adjustedOffset = offset;
489         fBreakCache.preceding(adjustedOffset);
490         return fDone ? DONE : fPosition;
491 
492     }
493 
494 
495     /**
496      * Throw IllegalArgumentException unless begin &lt;= offset &lt; end.
497      * @stable ICU 2.0
498      */
checkOffset(int offset, CharacterIterator text)499     protected static final void checkOffset(int offset, CharacterIterator text) {
500         if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
501             throw new IllegalArgumentException("offset out of bounds");
502         }
503     }
504 
505 
506     /**
507      * Returns true if the specified position is a boundary position.  As a side
508      * effect, leaves the iterator pointing to the first boundary position at
509      * or after "offset".
510      * @param offset the offset to check.
511      * @return True if "offset" is a boundary position.
512      * @stable ICU 2.0
513      */
514     @Override
isBoundary(int offset)515     public boolean isBoundary(int offset) {
516         // TODO: behavior difference with ICU4C, which considers out-of-range offsets
517         //       to not be boundaries, and to not be errors.
518         checkOffset(offset, fText);
519 
520         // Adjust offset to be on a code point boundary and not beyond the end of the text.
521         // Note that isBoundary() is always false for offsets that are not on code point boundaries.
522         // But we still need the side effect of leaving iteration at the following boundary.
523         int adjustedOffset = CISetIndex32(fText, offset);
524 
525         boolean result = false;
526         if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
527             result = (fBreakCache.current() == offset);
528         }
529 
530         if (!result) {
531             // Not on a boundary. isBoundary() must leave iterator on the following boundary.
532             // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
533             next();
534         }
535         return result;
536 
537     }
538 
539     /**
540      * Returns the current iteration position.  Note that DONE is never
541      * returned from this function; if iteration has run to the end of a
542      * string, current() will return the length of the string while
543      * next() will return BreakIterator.DONE).
544      * @return The current iteration position.
545      * @stable ICU 2.0
546      */
547     @Override
current()548     public int current() {
549         return (fText != null) ? fPosition : BreakIterator.DONE;
550     }
551 
552 
553     /**
554      * Return the status tag from the break rule that determined the boundary at
555      * the current iteration position.  The values appear in the rule source
556      * within brackets, {123}, for example.  For rules that do not specify a
557      * status, a default value of 0 is returned.  If more than one rule applies,
558      * the numerically largest of the possible status values is returned.
559      * <p>
560      * Of the standard types of ICU break iterators, only the word and line break
561      * iterator provides status values.  The values are defined in
562      * class RuleBasedBreakIterator, and allow distinguishing between words
563      * that contain alphabetic letters, "words" that appear to be numbers,
564      * punctuation and spaces, words containing ideographic characters, and
565      * more.  Call <code>getRuleStatus</code> after obtaining a boundary
566      * position from <code>next()</code>, <code>previous()</code>, or
567      * any other break iterator functions that returns a boundary position.
568      * <p>
569      * Note that <code>getRuleStatus()</code> returns the value corresponding to
570      * <code>current()</code> index even after <code>next()</code> has returned DONE.
571      * <p>
572 
573      * @return the status from the break rule that determined the boundary
574      * at the current iteration position.
575      *
576      * @stable ICU 60
577      */
578 
579     @Override
getRuleStatus()580     public int  getRuleStatus() {
581         //   Status records have this form:
582         //           Count N         <--  fLastRuleStatusIndex points here.
583         //           Status val 0
584         //           Status val 1
585         //              ...
586         //           Status val N-1  <--  the value we need to return
587         //   The status values are sorted in ascending order.
588         //   This function returns the last (largest) of the array of status values.
589         int  idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
590         int  tagVal = fRData.fStatusTable[idx];
591         return tagVal;
592     }
593 
594     /**
595      * Get the status (tag) values from the break rule(s) that determined the boundary
596      * at the current iteration position.  The values appear in the rule source
597      * within brackets, {123}, for example.  The default status value for rules
598      * that do not explicitly provide one is zero.
599      * <p>
600      * The status values used by the standard ICU break rules are defined
601      * as public constants in class RuleBasedBreakIterator.
602      * <p>
603      * If the size  of the output array is insufficient to hold the data,
604      *  the output will be truncated to the available length.  No exception
605      *  will be thrown.
606      *
607      * @param fillInArray an array to be filled in with the status values.
608      * @return          The number of rule status values from the rules that determined
609      *                  the boundary at the current iteration position.
610      *                  In the event that the array is too small, the return value
611      *                  is the total number of status values that were available,
612      *                  not the reduced number that were actually returned.
613      * @stable ICU 60
614      */
615     @Override
getRuleStatusVec(int[] fillInArray)616     public int getRuleStatusVec(int[] fillInArray) {
617         int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
618         if (fillInArray != null) {
619             int numToCopy = Math.min(numStatusVals, fillInArray.length);
620             for (int i=0; i<numToCopy; i++) {
621                 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
622             }
623         }
624         return numStatusVals;
625     }
626 
627     /**
628      * Returns a CharacterIterator over the text being analyzed.
629      * <p>
630      * <b><i>Caution:</i></b>The state of the returned CharacterIterator
631      * must not be modified in any way while the BreakIterator is still in use.
632      * Doing so will lead to undefined behavior of the BreakIterator.
633      * Clone the returned CharacterIterator first and work with that.
634      * <p>
635      * The returned CharacterIterator is a reference
636      * to the <b>actual iterator being used</b> by the BreakIterator.
637      * No guarantees are made about the current position
638      * of this iterator when it is returned; it may differ from the
639      * BreakIterators current position.  If you need to move that
640      * position to examine the text, clone this function's return value first.
641      * @return An iterator over the text being analyzed.
642      * @stable ICU 2.0
643      */
644     @Override
getText()645     public CharacterIterator getText() {
646         return fText;
647     }
648 
649     /**
650      * Set the iterator to analyze a new piece of text.  This function resets
651      * the current iteration position to the beginning of the text.
652      * (The old iterator is dropped.)
653      * <p>
654      * <b><i>Caution:</i></b> The supplied CharacterIterator is used
655      * directly by the BreakIterator, and must not be altered in any
656      * way by code outside of the BreakIterator.
657      * Doing so will lead to undefined behavior of the BreakIterator.
658      *
659      * @param newText An iterator over the text to analyze.
660      * @stable ICU 2.0
661      */
662     @Override
setText(CharacterIterator newText)663     public void setText(CharacterIterator newText) {
664         if (newText != null) {
665             fBreakCache.reset(newText.getBeginIndex(), 0);
666         } else {
667             fBreakCache.reset();
668         }
669         fDictionaryCache.reset();
670         fText = newText;
671         this.first();
672     }
673 
674      /**
675      * Control debug, trace and dump options.
676      * @internal
677      * @deprecated This API is ICU internal only.
678      */
679     @Deprecated
680     public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ?
681                                            ICUDebug.value(RBBI_DEBUG_ARG) : null;
682 
683 
getLanguageBreakEngine(int c)684     private LanguageBreakEngine getLanguageBreakEngine(int c) {
685 
686         // We have a dictionary character.
687         // Does an already instantiated break engine handle it?
688         for (LanguageBreakEngine candidate : fBreakEngines) {
689             if (candidate.handles(c)) {
690                 return candidate;
691             }
692         }
693 
694         synchronized (gAllBreakEngines) {
695             // This break iterator's list of break engines didn't handle the character.
696             // Check the global list, another break iterator may have instantiated the
697             // desired engine.
698             for (LanguageBreakEngine candidate : gAllBreakEngines) {
699                 if (candidate.handles(c)) {
700                     fBreakEngines.add(candidate);
701                     return candidate;
702                 }
703             }
704 
705             // The global list doesn't have an existing engine, build one.
706             int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
707             if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
708                 // Katakana, Hiragana and Han are handled by the same dictionary engine.
709                 // Fold them together for mapping from script -> engine.
710                 script = UScript.HAN;
711             }
712 
713             LanguageBreakEngine eng;
714             try {
715                 switch (script) {
716                 case UScript.THAI:
717                     eng = new ThaiBreakEngine();
718                     break;
719                 case UScript.LAO:
720                     eng = new LaoBreakEngine();
721                     break;
722                 case UScript.MYANMAR:
723                     eng = new BurmeseBreakEngine();
724                     break;
725                 case UScript.KHMER:
726                     eng = new KhmerBreakEngine();
727                     break;
728                 case UScript.HAN:
729                     eng = new CjkBreakEngine(false);
730                      break;
731                 case UScript.HANGUL:
732                     eng = new CjkBreakEngine(true);
733                     break;
734                 default:
735                     gUnhandledBreakEngine.handleChar(c);
736                     eng = gUnhandledBreakEngine;
737                     break;
738                 }
739             } catch (IOException e) {
740                 eng = null;
741             }
742 
743             if (eng != null && eng != gUnhandledBreakEngine) {
744                 gAllBreakEngines.add(eng);
745                 fBreakEngines.add(eng);
746             }
747             return eng;
748         }   // end synchronized(gAllBreakEngines)
749     }
750 
751     /**
752      * The State Machine Engine for moving forward is here.
753      * This function is the heart of the RBBI run time engine.
754      *
755      * Input
756      *    fPosition, the position in the text to begin from.
757      * Output
758      *    fPosition:           the boundary following the starting position.
759      *    fDictionaryCharCount the number of dictionary characters encountered.
760      *                         If > 0, the segment will be further subdivided
761      *    fRuleStatusIndex     Info from the state table indicating which rules caused the boundary.
762      *
763      * @return the new iterator position
764      *
765      * A note on supplementary characters and the position of underlying
766      * Java CharacterIterator:   Normally, a character iterator is positioned at
767      * the char most recently returned by next().  Within this function, when
768      * a supplementary char is being processed, the char iterator is left
769      * sitting on the trail surrogate, in the middle of the code point.
770      * This is different from everywhere else, where an iterator always
771      * points at the lead surrogate of a supplementary.
772      */
handleNext()773     private int handleNext() {
774         if (TRACE) {
775             System.out.println("Handle Next   pos      char  state category");
776         }
777 
778         // handleNext always sets the break tag value.
779         // Set the default for it.
780         fRuleStatusIndex  = 0;
781         fDictionaryCharCount = 0;
782 
783         // caches for quicker access
784         CharacterIterator text = fText;
785         CodePointTrie trie = fRData.fTrie;
786 
787         char[] stateTable  = fRData.fFTable.fTable;
788         int initialPosition = fPosition;
789         text.setIndex(initialPosition);
790         int result          = initialPosition;
791 
792         // Set up the starting char
793         int c = text.current();
794         if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
795             c = nextTrail32(text, c);
796             if (c == DONE32) {
797                 fDone = true;
798                 return BreakIterator.DONE;
799             }
800         }
801 
802         // Set the initial state for the state machine
803         int state           = START_STATE;
804         int row             = fRData.getRowIndex(state);
805         short category      = 3;
806         int flagsState      = fRData.fFTable.fFlags;
807         int dictStart       = fRData.fFTable.fDictCategoriesStart;
808         int mode            = RBBI_RUN;
809         if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
810             category = 2;
811             mode     = RBBI_START;
812             if (TRACE) {
813                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
814                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
815                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
816             }
817         }
818 
819         // loop until we reach the end of the text or transition to state 0
820         while (state != STOP_STATE) {
821             if (c == DONE32) {
822                 // Reached end of input string.
823                 if (mode == RBBI_END) {
824                     // We have already run the loop one last time with the
825                     // character set to the pseudo {eof} value. Now it is time
826                     // to unconditionally bail out.
827                     break;
828                 }
829                 // Run the loop one last time with the fake end-of-input character category
830                 mode = RBBI_END;
831                 category = 1;
832             }
833             else if (mode == RBBI_RUN) {
834                 // Get the char category.  An incoming category of 1 or 2 mens that
835                 //      we are preset for doing the beginning or end of input, and
836                 //      that we shouldn't get a category from an actual text input character.
837                 //
838 
839                 // look up the current character's character category, which tells us
840                 // which column in the state table to look at.
841                 //
842                 category = (short) trie.get(c);
843 
844                 // Check for categories that require word dictionary handling.
845                 if (category >= dictStart) {
846                     fDictionaryCharCount++;
847                 }
848 
849                 if (TRACE) {
850                     System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
851                     System.out.print(RBBIDataWrapper.intToHexString(c, 10));
852                     System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
853                 }
854 
855                 // Advance to the next character.
856                 // If this is a beginning-of-input loop iteration, don't advance.
857                 //    The next iteration will be processing the first real input character.
858                 c = text.next();
859                 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
860                     c = nextTrail32(text, c);
861                 }
862             }
863             else {
864                 mode = RBBI_RUN;
865             }
866 
867             // look up a state transition in the state table
868             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
869             row   = fRData.getRowIndex(state);
870             int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
871             if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
872                 // Match found, common case
873                 result = text.getIndex();
874                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
875                     // The iterator has been left in the middle of a surrogate pair.
876                     // We want the start of it.
877                     result--;
878                 }
879 
880                 //  Remember the break status (tag) values.
881                 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
882             } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
883                 // Lookahead match is completed
884                 int lookaheadResult = fLookAheadMatches[accepting];
885                 if (lookaheadResult >= 0) {
886                     fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
887                     fPosition = lookaheadResult;
888                     return lookaheadResult;
889                 }
890             }
891 
892 
893             // If we are at the position of the '/' in a look-ahead (hard break) rule;
894             // record the current position, to be returned later, if the full rule matches.
895             // TODO: Move this check before the previous check of fAccepting.
896             //       This would enable hard-break rules with no following context.
897             //       But there are line break test failures when trying this. Investigate.
898             //       Issue ICU-20837
899             int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
900             if (rule != 0) {
901                 int  pos = text.getIndex();
902                 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
903                     // The iterator has been left in the middle of a surrogate pair.
904                     // We want the beginning  of it.
905                     pos--;
906                 }
907                 fLookAheadMatches[rule] = pos;
908             }
909 
910 
911         }        // End of state machine main loop
912 
913         // The state machine is done.  Check whether it found a match...
914 
915         // If the iterator failed to advance in the match engine force it ahead by one.
916         // This indicates a defect in the break rules, which should always match
917         // at least one character.
918 
919         if (result == initialPosition) {
920             if (TRACE) {
921                 System.out.println("Iterator did not move. Advancing by 1.");
922             }
923             text.setIndex(initialPosition);
924             next32(text);
925             result = text.getIndex();
926             fRuleStatusIndex = 0;
927         }
928 
929         // Leave the iterator at our result position.
930         //   (we may have advanced beyond the last accepting position chasing after
931         //    longer matches that never completed.)
932         fPosition = result;
933 
934         if (TRACE) {
935             System.out.println("result = " + result);
936         }
937         return result;
938     }
939 
940     /**
941      * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
942      * This locates a "Safe Position" from which the forward break rules
943      * will operate correctly. A Safe Position is not necessarily a boundary itself.
944      *
945      * The logic of this function is very similar to handleNext(), above, but simpler
946      * because the safe table does not require as many options.
947      *
948      * @param fromPosition the position in the input text to begin the iteration.
949      * @internal
950      */
handleSafePrevious(int fromPosition)951     private int handleSafePrevious(int fromPosition) {
952         char            state;
953         short           category = 0;
954         int             result = 0;
955 
956         // caches for quicker access
957         CharacterIterator text = fText;
958         CodePointTrie trie = fRData.fTrie;
959         char[] stateTable  = fRData.fRTable.fTable;
960 
961         CISetIndex32(text, fromPosition);
962         if (TRACE) {
963             System.out.print("Handle Previous   pos   char  state category");
964         }
965 
966         // if we're already at the start of the text, return DONE.
967         if (text.getIndex() == text.getBeginIndex()) {
968             return BreakIterator.DONE;
969         }
970 
971         //  Set the initial state for the state machine
972         int c = CharacterIteration.previous32(text);
973         state = START_STATE;
974         int row = fRData.getRowIndex(state);
975 
976         // loop until we reach the start of the text or transition to state 0
977         //
978         for (; c != DONE32; c = CharacterIteration.previous32(text)) {
979 
980             // look up the current character's character category, which tells us
981             // which column in the state table to look at.
982             //
983             //  And off the dictionary flag bit. For reverse iteration it is not used.
984             category = (short) trie.get(c);
985             if (TRACE) {
986                 System.out.print("            " +  RBBIDataWrapper.intToString(text.getIndex(), 5));
987                 System.out.print(RBBIDataWrapper.intToHexString(c, 10));
988                 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
989             }
990 
991             // State Transition - move machine to its next state
992             //
993             assert(category < fRData.fHeader.fCatCount);
994             state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
995             row   = fRData.getRowIndex(state);
996 
997             if (state == STOP_STATE) {
998                 // This is the normal exit from the lookup state machine.
999                 // Transition to state zero means we have found a safe point.
1000                 break;
1001             }
1002         }
1003 
1004         // The state machine is done.
1005         result = text.getIndex();
1006         if (TRACE) {
1007             System.out.println("result = " + result);
1008         }
1009         return result;
1010     }
1011 
1012     /**
1013      * Set the index of a CharacterIterator.
1014      * Pin the index to the valid range range of BeginIndex <= index <= EndIndex.
1015      * If the index points to a trail surrogate of a supplementary character, adjust it
1016      * to the start (lead surrogate) index.
1017      *
1018      * @param ci A CharacterIterator to set
1019      * @param index the index to set
1020      * @return the resulting index, possibly pinned or adjusted.
1021      */
1022     private static int CISetIndex32(CharacterIterator ci, int index) {
1023         if (index <= ci.getBeginIndex()) {
1024             ci.first();
1025         } else if (index >= ci.getEndIndex()) {
1026             ci.setIndex(ci.getEndIndex());
1027         } else if (Character.isLowSurrogate(ci.setIndex(index))) {
1028             if (!Character.isHighSurrogate(ci.previous())) {
1029                 ci.next();
1030             }
1031         }
1032         return ci.getIndex();
1033     }
1034 
1035     /** DictionaryCache  stores the boundaries obtained from a run of dictionary characters.
1036      *                 Dictionary boundaries are moved first to this cache, then from here
1037      *                 to the main BreakCache, where they may inter-leave with non-dictionary
1038      *                 boundaries. The public BreakIterator API always fetches directly
1039      *                 from the main BreakCache, not from here.
1040      *
1041      *                 In common situations, the number of boundaries in a single dictionary run
1042      *                 should be quite small, it will be terminated by punctuation, spaces,
1043      *                 or any other non-dictionary characters. The main BreakCache may end
1044      *                 up with boundaries from multiple dictionary based runs.
1045      *
1046      *                 The boundaries are stored in a simple ArrayList (vector), with the
1047      *                 assumption that they will be accessed sequentially.
1048      */
1049     class DictionaryCache  {
1050 
1051          void reset() {
1052              fPositionInCache = -1;
1053              fStart = 0;
1054              fLimit = 0;
1055              fFirstRuleStatusIndex = 0;
1056              fOtherRuleStatusIndex = 0;
1057              fBreaks.removeAllElements();
1058          };
1059 
1060          boolean following(int fromPos) {
1061              if (fromPos >= fLimit || fromPos < fStart) {
1062                  fPositionInCache = -1;
1063                  return false;
1064              }
1065 
1066              // Sequential iteration, move from previous boundary to the following
1067 
1068              int r = 0;
1069              if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1070                  ++fPositionInCache;
1071                  if (fPositionInCache >= fBreaks.size()) {
1072                      fPositionInCache = -1;
1073                      return false;
1074                  }
1075                  r = fBreaks.elementAt(fPositionInCache);
1076                  assert(r > fromPos);
1077                  fBoundary = r;
1078                  fStatusIndex = fOtherRuleStatusIndex;
1079                  return true;
1080              }
1081 
1082              // Random indexing. Linear search for the boundary following the given position.
1083 
1084              for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
1085                  r= fBreaks.elementAt(fPositionInCache);
1086                  if (r > fromPos) {
1087                      fBoundary = r;
1088                      fStatusIndex = fOtherRuleStatusIndex;
1089                      return true;
1090                  }
1091              }
1092 
1093              // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
1094              assert(false);
1095              fPositionInCache = -1;
1096              return false;
1097          };
1098 
preceding(int fromPos)1099          boolean preceding(int fromPos) {
1100              if (fromPos <= fStart || fromPos > fLimit) {
1101                  fPositionInCache = -1;
1102                  return false;
1103              }
1104 
1105              if (fromPos == fLimit) {
1106                  fPositionInCache = fBreaks.size() - 1;
1107                  if (fPositionInCache >= 0) {
1108                      assert(fBreaks.elementAt(fPositionInCache) == fromPos);
1109                  }
1110              }
1111 
1112              int r;
1113              if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) {
1114                  --fPositionInCache;
1115                  r = fBreaks.elementAt(fPositionInCache);
1116                  assert(r < fromPos);
1117                  fBoundary = r;
1118                  fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1119                  return true;
1120              }
1121 
1122              if (fPositionInCache == 0) {
1123                  fPositionInCache = -1;
1124                  return false;
1125              }
1126 
1127              for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) {
1128                  r = fBreaks.elementAt(fPositionInCache);
1129                  if (r < fromPos) {
1130                      fBoundary = r;
1131                      fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex;
1132                      return true;
1133                  }
1134              }
1135              assert(false);
1136              fPositionInCache = -1;
1137              return false;
1138          };
1139 
1140         /**
1141          * Populate the cache with the dictionary based boundaries within a region of text.
1142          * @param startPos  The start position of a range of text
1143          * @param endPos    The end position of a range of text
1144          * @param firstRuleStatus The rule status index that applies to the break at startPos
1145          * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
1146          * @internal
1147          */
populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1148         void populateDictionary(int startPos, int endPos,
1149                                 int firstRuleStatus, int otherRuleStatus) {
1150             if ((endPos - startPos) <= 1) {
1151                 return;
1152             }
1153 
1154             reset();
1155             fFirstRuleStatusIndex = firstRuleStatus;
1156             fOtherRuleStatusIndex = otherRuleStatus;
1157 
1158             int rangeStart = startPos;
1159             int rangeEnd = endPos;
1160 
1161             int         category;
1162             int         current;
1163             int         foundBreakCount = 0;
1164 
1165             // Loop through the text, looking for ranges of dictionary characters.
1166             // For each span, find the appropriate break engine, and ask it to find
1167             // any breaks within the span.
1168 
1169             fText.setIndex(rangeStart);
1170             int     c = CharacterIteration.current32(fText);
1171             category = (short)fRData.fTrie.get(c);
1172             int dictStart = fRData.fFTable.fDictCategoriesStart;
1173 
1174             while(true) {
1175                 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
1176                     c = CharacterIteration.next32(fText);    // pre-increment
1177                     category = (short)fRData.fTrie.get(c);
1178                 }
1179                 if (current >= rangeEnd) {
1180                     break;
1181                 }
1182 
1183                 // We now have a dictionary character. Get the appropriate language object
1184                 // to deal with it.
1185                 LanguageBreakEngine lbe = getLanguageBreakEngine(c);
1186 
1187                 // Ask the language object if there are any breaks. It will add them to the cache and
1188                 // leave the text pointer on the other side of its range, ready to search for the next one.
1189                 if (lbe != null) {
1190                     foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
1191                 }
1192 
1193                 // Reload the loop variables for the next go-round
1194                 c = CharacterIteration.current32(fText);
1195                 category = (short)fRData.fTrie.get(c);
1196             }
1197 
1198             // If we found breaks, ensure that the first and last entries are
1199             // the original starting and ending position. And initialize the
1200             // cache iteration position to the first entry.
1201 
1202             // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
1203             if (foundBreakCount > 0) {
1204                 assert(foundBreakCount == fBreaks.size());
1205                 if (startPos < fBreaks.elementAt(0)) {
1206                     // The dictionary did not place a boundary at the start of the segment of text.
1207                     // Add one now. This should not commonly happen, but it would be easy for interactions
1208                     // of the rules for dictionary segments and the break engine implementations to
1209                     // inadvertently cause it. Cover it here, just in case.
1210                     fBreaks.offer(startPos);
1211                 }
1212                 if (endPos > fBreaks.peek()) {
1213                     fBreaks.push(endPos);
1214                 }
1215                 fPositionInCache = 0;
1216                 // Note: Dictionary matching may extend beyond the original limit.
1217                 fStart = fBreaks.elementAt(0);
1218                 fLimit = fBreaks.peek();
1219             } else {
1220                 // there were no language-based breaks, even though the segment contained
1221                 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
1222                 // for this range will fail, and the calling code will fall back to the rule based boundaries.
1223             }
1224 
1225         };
1226 
1227 
DictionaryCache()1228         DictionaryCache() {
1229             fPositionInCache = -1;
1230             fBreaks = new DictionaryBreakEngine.DequeI();
1231         }
1232 
1233         /**
1234          * copy constructor. Used by RuleBasedBreakIterator.clone().
1235          *
1236          * @param src the source object to be copied.
1237          */
DictionaryCache(DictionaryCache src)1238         DictionaryCache(DictionaryCache src)  {
1239             try {
1240                 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone();
1241             }
1242             catch (CloneNotSupportedException e) {
1243                 throw new RuntimeException(e);
1244             }
1245             fPositionInCache      = src.fPositionInCache;
1246             fStart                = src.fStart;
1247             fLimit                = src.fLimit;
1248             fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
1249             fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
1250             fBoundary             = src.fBoundary;
1251             fStatusIndex          = src.fStatusIndex;
1252         }
1253 
1254         // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
1255         DictionaryBreakEngine.DequeI fBreaks;
1256         int             fPositionInCache;       // Index in fBreaks of last boundary returned by following()
1257         //                                      //    or preceding(). Optimizes sequential access.
1258         int             fStart;                 // Text position of first boundary in cache.
1259         int             fLimit;                 // Last boundary in cache. Which is the limit of the
1260         //                                      //    text segment being handled by the dictionary.
1261         int             fFirstRuleStatusIndex;  // Rule status info for first boundary.
1262         int             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.
1263         int             fBoundary;              // Current boundary. Set by preceding(), following().
1264         int             fStatusIndex;           // Current rule status index. Set by preceding, following().
1265     };
1266 
1267 
1268 
1269 
1270 /*
1271  * class BreakCache
1272  *
1273  * Cache of break boundary positions and rule status values.
1274  * Break iterator API functions, next(), previous(), etc., will use cached results
1275  * when possible, and otherwise cache new results as they are obtained.
1276  *
1277  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
1278  *
1279  * The cache is implemented as a single circular buffer.
1280  */
1281 
1282 /*
1283  * size of the circular cache buffer.
1284  */
1285 
1286 class BreakCache {
1287 
BreakCache()1288     BreakCache() {
1289         reset();
1290     };
1291 
reset(int pos, int ruleStatus)1292     void  reset(int pos, int ruleStatus) {
1293         fStartBufIdx = 0;
1294         fEndBufIdx = 0;
1295         fTextIdx = pos;
1296         fBufIdx = 0;
1297         fBoundaries[0] = pos;
1298         fStatuses[0] = (short)ruleStatus;
1299     }
1300 
reset()1301     void  reset() {reset(0, 0); };
1302 
next()1303     void  next() {
1304         if (fBufIdx == fEndBufIdx) {
1305             fDone = !populateFollowing();
1306             fPosition = fTextIdx;
1307             fRuleStatusIndex = fStatuses[fBufIdx];
1308         } else {
1309             fBufIdx = modChunkSize(fBufIdx + 1);
1310             fTextIdx = fPosition = fBoundaries[fBufIdx];
1311             fRuleStatusIndex = fStatuses[fBufIdx];
1312         }
1313     };
1314 
previous()1315     void  previous() {
1316         int initialBufIdx = fBufIdx;
1317         if (fBufIdx == fStartBufIdx) {
1318             // At start of cache. Prepend to it.
1319             populatePreceding();
1320         } else {
1321             // Cache already holds the next boundary
1322             fBufIdx = modChunkSize(fBufIdx - 1);
1323             fTextIdx = fBoundaries[fBufIdx];
1324         }
1325         fDone = (fBufIdx == initialBufIdx);
1326         fPosition = fTextIdx;
1327         fRuleStatusIndex = fStatuses[fBufIdx];
1328         return;
1329     };
1330 
1331     // Move the iteration state to the position following the startPosition.
1332     // Input position must be pinned to the input length.
following(int startPos)1333     void following(int startPos) {
1334         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1335             // startPos is in the cache. Do a next() from that position.
1336             // TODO: an awkward set of interactions with bi->fDone
1337             //       seek() does not clear it; it can't because of interactions with populateNear().
1338             //       next() does not clear it in the fast-path case, where everything matters. Maybe it should.
1339             //       So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
1340             fDone = false;
1341             next();
1342         }
1343 
1344     };
1345 
preceding(int startPos)1346     void  preceding(int startPos) {
1347         if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
1348             if (startPos == fTextIdx) {
1349                 previous();
1350             } else {
1351                 // seek() leaves the BreakCache positioned at the preceding boundary
1352                 //        if the requested position is between two boundaries.
1353                 // current() pushes the BreakCache position out to the BreakIterator itself.
1354                 assert(startPos > fTextIdx);
1355                 current();
1356             }
1357         }
1358         return;
1359     };
1360 
1361     /**
1362      * Update the state of the public BreakIterator (fBI) to reflect the
1363      * current state of the break iterator cache (this).
1364      */
current()1365     int current() {
1366         fPosition = fTextIdx;
1367         fRuleStatusIndex = fStatuses[fBufIdx];
1368         fDone = false;
1369         return fTextIdx;
1370     };
1371 
1372     /**
1373      * Add boundaries to the cache near the specified position.
1374      * The given position need not be a boundary itself.
1375      * The input position must be within the range of the text, and
1376      * on a code point boundary.
1377      * If the requested position is a break boundary, leave the iteration
1378      * position on it.
1379      * If the requested position is not a boundary, leave the iteration
1380      * position on the preceding boundary and include both the the
1381      * preceding and following boundaries in the cache.
1382      * Additional boundaries, either preceding or following, may be added
1383      * to the cache as a side effect.
1384      *
1385      * Return false if the operation failed.
1386      */
populateNear(int position)1387     boolean populateNear(int position) {
1388         assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);
1389 
1390         // Find a boundary somewhere in the vicinity of the requested position.
1391         // Depending on the safe rules and the text data, it could be either before, at, or after
1392         // the requested position.
1393 
1394 
1395         // If the requested position is not near already cached positions, clear the existing cache,
1396         // find a near-by boundary and begin new cache contents there.
1397 
1398         if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
1399             int aBoundary = fText.getBeginIndex();
1400             int ruleStatusIndex = 0;
1401             if (position > aBoundary + 20) {
1402                 int backupPos = handleSafePrevious(position);
1403                 if (backupPos > aBoundary) {
1404                     // Advance to the boundary following the backup position.
1405                     // There is a complication: the safe reverse rules identify pairs of code points
1406                     // that are safe. If advancing from the safe point moves forwards by less than
1407                     // two code points, we need to advance one more time to ensure that the boundary
1408                     // is good, including a correct rules status value.
1409                     //
1410                     fPosition = backupPos;
1411                     aBoundary = handleNext();
1412                     if (aBoundary == backupPos + 1 ||
1413                             (aBoundary == backupPos + 2 &&
1414                             Character.isHighSurrogate(fText.setIndex(backupPos)) &&
1415                             Character.isLowSurrogate(fText.next()))) {
1416                         // The initial handleNext() only advanced by a single code point. Go again.
1417                         // Safe rules identify safe pairs.
1418                         aBoundary = handleNext();
1419                     }
1420                 }
1421                 ruleStatusIndex = fRuleStatusIndex;
1422             }
1423             reset(aBoundary, ruleStatusIndex);               // Reset cache to hold aBoundary as a single starting point.
1424         }
1425 
1426         // Fill in boundaries between existing cache content and the new requested position.
1427 
1428         if (fBoundaries[fEndBufIdx] < position) {
1429             // The last position in the cache precedes the requested position.
1430             // Add following position(s) to the cache.
1431             while (fBoundaries[fEndBufIdx] < position) {
1432                 if (!populateFollowing()) {
1433                     assert false;
1434                     return false;
1435                 }
1436             }
1437             fBufIdx = fEndBufIdx;                      // Set iterator position to the end of the buffer.
1438             fTextIdx = fBoundaries[fBufIdx];           // Required because populateFollowing may add extra boundaries.
1439             while (fTextIdx > position) {              // Move backwards to a position at or preceding the requested pos.
1440                 previous();
1441             }
1442             return true;
1443         }
1444 
1445         if (fBoundaries[fStartBufIdx] > position) {
1446             // The first position in the cache is beyond the requested position.
1447             // back up more until we get a boundary <= the requested position.
1448             while (fBoundaries[fStartBufIdx] > position) {
1449                 populatePreceding();
1450             }
1451             fBufIdx = fStartBufIdx;                    // Set iterator position to the start of the buffer.
1452             fTextIdx = fBoundaries[fBufIdx];           // Required because populatePreceding may add extra boundaries.
1453             while (fTextIdx < position) {              // Move forwards to a position at or following the requested pos.
1454                 next();
1455             }
1456             if (fTextIdx > position) {
1457                 // If position is not itself a boundary, the next() loop above will overshoot.
1458                 // Back up one, leaving cache position at the boundary preceding the requested position.
1459                 previous();
1460             }
1461             return true;
1462         }
1463 
1464         assert fTextIdx == position;
1465         return true;
1466 
1467     };
1468 
1469     /**
1470      *  Add boundary(s) to the cache following the current last boundary.
1471      *  Return false if at the end of the text, and no more boundaries can be added.
1472      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
1473      */
populateFollowing()1474     boolean populateFollowing() {
1475         int fromPosition = fBoundaries[fEndBufIdx];
1476         int fromRuleStatusIdx = fStatuses[fEndBufIdx];
1477         int pos = 0;
1478         int ruleStatusIdx = 0;
1479 
1480         if (fDictionaryCache.following(fromPosition)) {
1481             addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1482             return true;
1483         }
1484 
1485         fPosition = fromPosition;
1486         pos = handleNext();
1487         if (pos == BreakIterator.DONE) {
1488             return false;
1489         }
1490 
1491         ruleStatusIdx = fRuleStatusIndex;
1492         if (fDictionaryCharCount > 0) {
1493             // The text segment obtained from the rules includes dictionary characters.
1494             // Subdivide it, with subdivided results going into the dictionary cache.
1495             fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
1496             if (fDictionaryCache.following(fromPosition)) {
1497                 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1498                 return true;
1499                 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point.
1500                 //       But be careful with interactions with populateNear().
1501             }
1502         }
1503 
1504         // Rule based segment did not include dictionary characters.
1505         // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
1506         //    meaning that we didn't take the return, above.
1507         // Add its end point to the cache.
1508         addFollowing(pos, ruleStatusIdx, UpdateCachePosition);
1509 
1510         // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
1511         //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
1512         //
1513         for (int count=0; count<6; ++count) {
1514             pos = handleNext();
1515             if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
1516                 break;
1517             }
1518             addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
1519         }
1520         return true;
1521     };
1522 
1523     /**
1524      *  Add one or more boundaries to the cache preceding the first currently cached boundary.
1525      *  Leave the iteration position on the first added boundary.
1526      *  Return false if no boundaries could be added (if at the start of the text.)
1527      */
populatePreceding()1528     boolean populatePreceding() {
1529         int textBegin = fText.getBeginIndex();
1530         int fromPosition = fBoundaries[fStartBufIdx];
1531         if (fromPosition == textBegin) {
1532             return false;
1533         }
1534 
1535         int position = textBegin;
1536         int positionStatusIdx = 0;
1537 
1538         if (fDictionaryCache.preceding(fromPosition)) {
1539             addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition);
1540             return true;
1541         }
1542 
1543         int backupPosition = fromPosition;
1544 
1545         // Find a boundary somewhere preceding the first already-cached boundary
1546         do {
1547             backupPosition = backupPosition - 30;
1548             if (backupPosition <= textBegin) {
1549                 backupPosition = textBegin;
1550             } else {
1551                 backupPosition = handleSafePrevious(backupPosition);
1552             }
1553             if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
1554                 position = textBegin;
1555                 positionStatusIdx = 0;
1556             } else {
1557                 // Advance to the boundary following the backup position.
1558                 // There is a complication: the safe reverse rules identify pairs of code points
1559                 // that are safe. If advancing from the safe point moves forwards by less than
1560                 // two code points, we need to advance one more time to ensure that the boundary
1561                 // is good, including a correct rules status value.
1562                 //
1563                 fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
1564                 position = handleNext();
1565                 if (position == backupPosition + 1 ||
1566                         (position == backupPosition + 2 &&
1567                         Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
1568                         Character.isLowSurrogate(fText.next()))) {
1569                     // The initial handleNext() only advanced by a single code point. Go again.
1570                     // Safe rules identify safe pairs.
1571                     position = handleNext();
1572                 }
1573                 positionStatusIdx = fRuleStatusIndex;
1574             }
1575         } while (position >= fromPosition);
1576 
1577         // Find boundaries between the one we just located and the first already-cached boundary
1578         // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
1579 
1580         fSideBuffer.removeAllElements();
1581         fSideBuffer.push(position);
1582         fSideBuffer.push(positionStatusIdx);
1583 
1584         do {
1585             int prevPosition = fPosition = position;
1586             int prevStatusIdx = positionStatusIdx;
1587             position = handleNext();
1588             positionStatusIdx = fRuleStatusIndex;
1589             if (position == BreakIterator.DONE) {
1590                 break;
1591             }
1592 
1593             boolean segmentHandledByDictionary = false;
1594             if (fDictionaryCharCount != 0) {
1595                 // Segment from the rules includes dictionary characters.
1596                 // Subdivide it, with subdivided results going into the dictionary cache.
1597                 int dictSegEndPosition = position;
1598                 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
1599                 while (fDictionaryCache.following(prevPosition)) {
1600                     position = fDictionaryCache.fBoundary;
1601                     positionStatusIdx = fDictionaryCache.fStatusIndex;
1602                     segmentHandledByDictionary = true;
1603                     assert(position > prevPosition);
1604                     if (position >= fromPosition) {
1605                         break;
1606                     }
1607                     assert(position <= dictSegEndPosition);
1608                     fSideBuffer.push(position);
1609                     fSideBuffer.push(positionStatusIdx);
1610                     prevPosition = position;
1611                 }
1612                 assert(position==dictSegEndPosition || position>=fromPosition);
1613             }
1614 
1615             if (!segmentHandledByDictionary && position < fromPosition) {
1616                 fSideBuffer.push(position);
1617                 fSideBuffer.push(positionStatusIdx);
1618             }
1619         } while (position < fromPosition);
1620 
1621         // Move boundaries from the side buffer to the main circular buffer.
1622         boolean success = false;
1623         if (!fSideBuffer.isEmpty()) {
1624             positionStatusIdx = fSideBuffer.pop();
1625             position = fSideBuffer.pop();
1626             addPreceding(position, positionStatusIdx, UpdateCachePosition);
1627             success = true;
1628         }
1629 
1630         while (!fSideBuffer.isEmpty()) {
1631             positionStatusIdx = fSideBuffer.pop();
1632             position = fSideBuffer.pop();
1633             if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
1634                 // No space in circular buffer to hold a new preceding result while
1635                 // also retaining the current cache (iteration) position.
1636                 // Bailing out is safe; the cache will refill again if needed.
1637                 break;
1638             }
1639         }
1640         return success;
1641     };
1642 
1643 
1644     static final boolean RetainCachePosition = false;
1645     static final boolean UpdateCachePosition = true;
1646 
1647     /**
1648      * Add the boundary following the current position.
1649      * The current position can be left as it was, or changed to the newly added boundary,
1650      * as specified by the update parameter.
1651      */
addFollowing(int position, int ruleStatusIdx, boolean update)1652     void addFollowing(int position, int ruleStatusIdx, boolean update) {
1653         assert(position > fBoundaries[fEndBufIdx]);
1654         assert(ruleStatusIdx <= Short.MAX_VALUE);
1655         int nextIdx = modChunkSize(fEndBufIdx + 1);
1656         if (nextIdx == fStartBufIdx) {
1657             fStartBufIdx = modChunkSize(fStartBufIdx + 6);    // TODO: experiment. Probably revert to 1.
1658         }
1659         fBoundaries[nextIdx] = position;
1660         fStatuses[nextIdx] = (short)ruleStatusIdx;
1661         fEndBufIdx = nextIdx;
1662         if (update == UpdateCachePosition) {
1663             // Set current position to the newly added boundary.
1664             fBufIdx = nextIdx;
1665             fTextIdx = position;
1666         } else {
1667             // Retaining the original cache position.
1668             // Check if the added boundary wraps around the buffer, and would over-write the original position.
1669             // It's the responsibility of callers of this function to not add too many.
1670             assert(nextIdx != fBufIdx);
1671         }
1672 
1673     };
1674 
1675 
1676     /**
1677      * Add the boundary preceding the current position.
1678      * The current position can be left as it was, or changed to the newly added boundary,
1679      * as specified by the update parameter.
1680      */
addPreceding(int position, int ruleStatusIdx, boolean update)1681     boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
1682         assert(position < fBoundaries[fStartBufIdx]);
1683         assert(ruleStatusIdx <= Short.MAX_VALUE);
1684         int nextIdx = modChunkSize(fStartBufIdx - 1);
1685         if (nextIdx == fEndBufIdx) {
1686             if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
1687                 // Failure. The insertion of the new boundary would claim the buffer position that is the
1688                 // current iteration position. And we also want to retain the current iteration position.
1689                 // (The buffer is already completely full of entries that precede the iteration position.)
1690                 return false;
1691             }
1692             fEndBufIdx = modChunkSize(fEndBufIdx - 1);
1693         }
1694         fBoundaries[nextIdx] = position;
1695         fStatuses[nextIdx] = (short)ruleStatusIdx;
1696         fStartBufIdx = nextIdx;
1697         if (update == UpdateCachePosition) {
1698             fBufIdx = nextIdx;
1699             fTextIdx = position;
1700         }
1701         return true;
1702     };
1703 
1704     /**
1705      *  Set the cache position to the specified position, or, if the position
1706      *  falls between to cached boundaries, to the preceding boundary.
1707      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.
1708      *  The startPosition must be on a code point boundary.
1709      *
1710      *  Return true if successful, false if the specified position is after
1711      *  the last cached boundary or before the first.
1712      */
1713     boolean seek(int pos) {
1714         if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
1715             return false;
1716         }
1717         if (pos == fBoundaries[fStartBufIdx]) {
1718             // Common case: seek(0), from BreakIterator::first()
1719             fBufIdx = fStartBufIdx;
1720             fTextIdx = fBoundaries[fBufIdx];
1721             return true;
1722         }
1723         if (pos == fBoundaries[fEndBufIdx]) {
1724             fBufIdx = fEndBufIdx;
1725             fTextIdx = fBoundaries[fBufIdx];
1726             return true;
1727         }
1728 
1729         int min = fStartBufIdx;
1730         int max = fEndBufIdx;
1731         while (min != max) {
1732             int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2;
1733             probe = modChunkSize(probe);
1734             if (fBoundaries[probe] > pos) {
1735                 max = probe;
1736             } else {
1737                 min = modChunkSize(probe + 1);
1738             }
1739         }
1740         assert(fBoundaries[max] > pos);
1741         fBufIdx = modChunkSize(max - 1);
1742         fTextIdx = fBoundaries[fBufIdx];
1743         assert(fTextIdx <= pos);
1744         return true;
1745 
1746     };
1747 
1748 
1749     /**
1750      * copy constructor, used from RuleBasedBreakIterator.clone().
1751      *
1752      * @param src
1753      */
BreakCache(BreakCache src)1754     BreakCache(BreakCache src)  {
1755         fStartBufIdx = src.fStartBufIdx;
1756         fEndBufIdx = src.fEndBufIdx;
1757         fTextIdx = src.fTextIdx;
1758         fBufIdx = src.fBufIdx;
1759         fBoundaries = src.fBoundaries.clone();
1760         fStatuses = src.fStatuses.clone();
1761         fSideBuffer = new DictionaryBreakEngine.DequeI();  // Transient, no need to clone contents.
1762     }
1763 
dumpCache()1764     void dumpCache() {
1765         System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
1766         for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) {
1767             System.out.printf("%d  %d%n", i, fBoundaries[i]);
1768             if (i == fEndBufIdx) {
1769                 break;
1770             }
1771         }
1772     };
1773 
modChunkSize(int index)1774     private final int   modChunkSize(int index) { return index & (CACHE_SIZE - 1); };
1775 
1776     static final int CACHE_SIZE = 128;
1777     // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");
1778 
1779     int                 fStartBufIdx;
1780     int                 fEndBufIdx;    // inclusive
1781 
1782     int                 fTextIdx;
1783     int                 fBufIdx;
1784 
1785     int[]               fBoundaries = new int[CACHE_SIZE];
1786     short[]             fStatuses = new short[CACHE_SIZE];
1787 
1788     DictionaryBreakEngine.DequeI   fSideBuffer = new DictionaryBreakEngine.DequeI();
1789 };
1790 
1791 
1792 
1793 
1794 }
1795 
1796