1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2005-2016 International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import static android.icu.impl.CharacterIteration.DONE32; 14 import static android.icu.impl.CharacterIteration.next32; 15 import static android.icu.impl.CharacterIteration.nextTrail32; 16 17 import java.io.ByteArrayOutputStream; 18 import java.io.IOException; 19 import java.io.InputStream; 20 import java.io.OutputStream; 21 import java.nio.ByteBuffer; 22 import java.text.CharacterIterator; 23 import java.util.ArrayList; 24 import java.util.List; 25 import java.util.MissingResourceException; 26 27 import android.icu.impl.CharacterIteration; 28 import android.icu.impl.ICUBinary; 29 import android.icu.impl.ICUDebug; 30 import android.icu.impl.RBBIDataWrapper; 31 import android.icu.impl.breakiter.BurmeseBreakEngine; 32 import android.icu.impl.breakiter.CjkBreakEngine; 33 import android.icu.impl.breakiter.DictionaryBreakEngine; 34 import android.icu.impl.breakiter.KhmerBreakEngine; 35 import android.icu.impl.breakiter.LSTMBreakEngine; 36 import android.icu.impl.breakiter.LanguageBreakEngine; 37 import android.icu.impl.breakiter.LaoBreakEngine; 38 import android.icu.impl.breakiter.ThaiBreakEngine; 39 import android.icu.impl.breakiter.UnhandledBreakEngine; 40 import android.icu.lang.UCharacter; 41 import android.icu.lang.UProperty; 42 import android.icu.lang.UScript; 43 import android.icu.util.CodePointTrie; 44 45 /** 46 * Rule Based Break Iterator 47 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 48 * 49 * @hide Only a subset of ICU is exposed in Android 50 */ 51 public class RuleBasedBreakIterator extends BreakIterator { 52 //======================================================================= 53 // Constructors & Factories 54 //======================================================================= 55 56 /** 57 * private constructor 58 */ RuleBasedBreakIterator()59 private RuleBasedBreakIterator() { 60 fDictionaryCharCount = 0; 61 synchronized(gAllBreakEngines) { 62 fBreakEngines = new ArrayList<>(gAllBreakEngines); 63 } 64 } 65 66 /** 67 * Create a break iterator from a precompiled set of break rules. 68 * 69 * Creating a break iterator from the binary rules is much faster than 70 * creating one from source rules. 71 * 72 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 73 * Binary break iterator rules are not guaranteed to be compatible between 74 * different versions of ICU. 75 * 76 * @param is an input stream supplying the compiled binary rules. 77 * @throws IOException if there is an error while reading the rules from the InputStream. 78 * @see #compileRules(String, OutputStream) 79 */ getInstanceFromCompiledRules(InputStream is)80 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 81 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 82 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 83 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 84 return This; 85 } 86 87 /** 88 * This factory method doesn't have an access modifier; it is only accessible in the same 89 * package. 90 * 91 * Create a break iterator from a precompiled set of break rules. 92 * 93 * Creating a break iterator from the binary rules is much faster than 94 * creating one from source rules. 95 * 96 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 97 * Binary break iterator rules are not guaranteed to be compatible between 98 * different versions of ICU. 99 * 100 * @param bytes a buffer supplying the compiled binary rules. 101 * @param phraseBreaking a flag indicating if phrase breaking is required. 102 * @throws IOException if there is an error while reading the rules from the buffer. 103 * @see #compileRules(String, OutputStream) 104 * @hide draft / provisional / internal are hidden on Android 105 */ getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)106 /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( 107 ByteBuffer bytes, boolean phraseBreaking) throws IOException { 108 RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); 109 instance.fPhraseBreaking = phraseBreaking; 110 return instance; 111 } 112 113 /** 114 * Create a break iterator from a precompiled set of break rules. 115 * 116 * Creating a break iterator from the binary rules is much faster than 117 * creating one from source rules. 118 * 119 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 120 * Binary break iterator rules are not guaranteed to be compatible between 121 * different versions of ICU. 122 * 123 * @param bytes a buffer supplying the compiled binary rules. 124 * @throws IOException if there is an error while reading the rules from the buffer. 125 * @see #compileRules(String, OutputStream) 126 * @deprecated This API is ICU internal only. 127 * @hide draft / provisional / internal are hidden on Android 128 */ 129 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)130 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 131 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 132 This.fRData = RBBIDataWrapper.get(bytes); 133 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 134 return This; 135 } 136 137 /** 138 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 139 * @param rules The break rules to be used. 140 */ RuleBasedBreakIterator(String rules)141 public RuleBasedBreakIterator(String rules) { 142 this(); 143 try { 144 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 145 compileRules(rules, ruleOS); 146 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 147 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 148 } catch (IOException e) { 149 ///CLOVER:OFF 150 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 151 // causing bogus compiled rules to be produced, but with no compile error raised. 152 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 153 + e.getMessage()); 154 throw rte; 155 ///CLOVER:ON 156 } 157 } 158 159 //======================================================================= 160 // Boilerplate 161 //======================================================================= 162 163 /** 164 * Clones this iterator. 165 * @return A newly-constructed RuleBasedBreakIterator with the same 166 * behavior as this one. 167 */ 168 @Override clone()169 public Object clone() { 170 RuleBasedBreakIterator result; 171 result = (RuleBasedBreakIterator)super.clone(); 172 if (fText != null) { 173 result.fText = (CharacterIterator)(fText.clone()); 174 } 175 synchronized (gAllBreakEngines) { 176 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 177 } 178 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 179 result.fBreakCache = result.new BreakCache(fBreakCache); 180 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 181 return result; 182 } 183 184 185 /** 186 * Returns true if both BreakIterators are of the same class, have the same 187 * rules, and iterate over the same text. 188 */ 189 @Override equals(Object that)190 public boolean equals(Object that) { 191 if (that == null) { 192 return false; 193 } 194 if (this == that) { 195 return true; 196 } 197 try { 198 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 199 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 200 return false; 201 } 202 if (fRData != null && other.fRData != null && 203 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 204 return false; 205 } 206 if (fText == null && other.fText == null) { 207 return true; 208 } 209 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 210 return false; 211 } 212 return fPosition == other.fPosition; 213 } 214 catch(ClassCastException e) { 215 return false; 216 } 217 } 218 219 /** 220 * Returns the description (rules) used to create this iterator. 221 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 222 */ 223 @Override toString()224 public String toString() { 225 String retStr = ""; 226 if (fRData != null) { 227 retStr = fRData.fRuleSource; 228 } 229 return retStr; 230 } 231 232 /** 233 * Compute a hashcode for this BreakIterator 234 * @return A hash code 235 */ 236 @Override hashCode()237 public int hashCode() 238 { 239 return fRData.fRuleSource.hashCode(); 240 } 241 242 243 private static final int START_STATE = 1; // The state number of the starting state 244 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 245 246 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 247 // of user text. A variable with this enum type keeps track of where we 248 // are. The state machine only fetches user text input while in RUN mode. 249 private static final int RBBI_START = 0; 250 private static final int RBBI_RUN = 1; 251 private static final int RBBI_END = 2; 252 253 /** 254 * The character iterator through which this BreakIterator accesses the text. 255 */ 256 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 257 258 /** 259 * The rule data for this BreakIterator instance. 260 * Not intended for public use. Declared public for testing purposes only. 261 * @deprecated This API is ICU internal only. 262 * @hide draft / provisional / internal are hidden on Android 263 */ 264 @Deprecated 265 public RBBIDataWrapper fRData; 266 267 /** 268 * The iteration state - current position, rule status for the current position, 269 * and whether the iterator ran off the end, yielding UBRK_DONE. 270 * Current position is pinned to be 0 < position <= text.length. 271 * Current position is always set to a boundary. 272 * 273 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 274 * Never has the value UBRK_DONE (-1). 275 */ 276 private int fPosition; 277 278 /** 279 * Index of the Rule {tag} values for the most recent match. 280 */ 281 private int fRuleStatusIndex; 282 283 /** 284 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 285 */ 286 private boolean fDone; 287 288 /** 289 * Array of look-ahead tentative results. 290 */ 291 private int[] fLookAheadMatches; 292 293 /** 294 * Cache of previously determined boundary positions. 295 */ 296 private BreakCache fBreakCache = new BreakCache(); 297 298 /** 299 * Flag used to indicate if phrase breaking is required. 300 */ 301 private boolean fPhraseBreaking = false; 302 303 304 /** 305 * Counter for the number of characters encountered with the "dictionary" 306 * flag set. Normal RBBI iterators don't use it, although the code 307 * for updating it is live. Dictionary Based break iterators (a subclass 308 * of us) access this field directly. 309 * @hide draft / provisional / internal are hidden on Android 310 */ 311 private int fDictionaryCharCount; 312 313 private DictionaryCache fDictionaryCache = new DictionaryCache(); 314 315 /** 316 * ICU debug argument name for RBBI 317 */ 318 private static final String RBBI_DEBUG_ARG = "rbbi"; 319 320 /** 321 * Debugging flag. Trace operation of state machine when true. 322 */ 323 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 324 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 325 326 /** 327 * The "default" break engine - just skips over ranges of dictionary words, 328 * producing no breaks. Should only be used if characters need to be handled 329 * by a dictionary but we have no dictionary implementation for them. 330 * 331 * Only one instance; shared by all break iterators. 332 */ 333 private static final UnhandledBreakEngine gUnhandledBreakEngine; 334 335 /** 336 * List of all known break engines, common for all break iterators. 337 * Lazily updated as break engines are needed, because instantiation of 338 * break engines is expensive. 339 * 340 * Because gAllBreakEngines can be referenced concurrently from different 341 * BreakIterator instances, all access is synchronized. 342 */ 343 private static final List<LanguageBreakEngine> gAllBreakEngines; 344 345 static { 346 gUnhandledBreakEngine = new UnhandledBreakEngine(); 347 gAllBreakEngines = new ArrayList<>(); 348 gAllBreakEngines.add(gUnhandledBreakEngine); 349 } 350 351 /** 352 * List of all known break engines. Similar to gAllBreakEngines, but local to a 353 * break iterator, allowing it to be used without synchronization. 354 */ 355 private List<LanguageBreakEngine> fBreakEngines; 356 357 /** 358 * Dump the contents of the state table and character classes for this break iterator. 359 * For debugging only. 360 * @deprecated This API is ICU internal only. 361 * @hide draft / provisional / internal are hidden on Android 362 */ 363 @Deprecated dump(java.io.PrintStream out)364 public void dump(java.io.PrintStream out) { 365 if (out == null) { 366 out = System.out; 367 } 368 this.fRData.dump(out); 369 } 370 371 /** 372 * Compile a set of source break rules into the binary state tables used 373 * by the break iterator engine. Creating a break iterator from precompiled 374 * rules is much faster than creating one from source rules. 375 * 376 * Binary break rules are not guaranteed to be compatible between different 377 * versions of ICU. 378 * 379 * 380 * @param rules The source form of the break rules 381 * @param ruleBinary An output stream to receive the compiled rules. 382 * @throws IOException If there is an error writing the output. 383 * @see #getInstanceFromCompiledRules(InputStream) 384 */ compileRules(String rules, OutputStream ruleBinary)385 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 386 RBBIRuleBuilder.compileRules(rules, ruleBinary); 387 } 388 389 //======================================================================= 390 // BreakIterator overrides 391 //======================================================================= 392 393 /** 394 * Sets the current iteration position to the beginning of the text. 395 * (i.e., the CharacterIterator's starting offset). 396 * @return The offset of the beginning of the text. 397 */ 398 @Override first()399 public int first() { 400 if (fText == null) { 401 return BreakIterator.DONE; 402 } 403 fText.first(); 404 int start = fText.getIndex(); 405 if (!fBreakCache.seek(start)) { 406 fBreakCache.populateNear(start); 407 } 408 fBreakCache.current(); 409 assert(fPosition == start); 410 return fPosition; 411 } 412 413 /** 414 * Sets the current iteration position to the end of the text. 415 * (i.e., the CharacterIterator's ending offset). 416 * @return The text's past-the-end offset. 417 */ 418 @Override last()419 public int last() { 420 if (fText == null) { 421 return BreakIterator.DONE; 422 } 423 int endPos = fText.getEndIndex(); 424 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 425 assert(endShouldBeBoundary); 426 if (fPosition != endPos) { 427 assert(fPosition == endPos); 428 } 429 return endPos; 430 } 431 432 /** 433 * Advances the iterator either forward or backward the specified number of steps. 434 * Negative values move backward, and positive values move forward. This is 435 * equivalent to repeatedly calling next() or previous(). 436 * @param n The number of steps to move. The sign indicates the direction 437 * (negative is backwards, and positive is forwards). 438 * @return The character offset of the boundary position n boundaries away from 439 * the current one. 440 */ 441 @Override next(int n)442 public int next(int n) { 443 int result = 0; 444 if (n > 0) { 445 for (; n > 0 && result != DONE; --n) { 446 result = next(); 447 } 448 } else if (n < 0) { 449 for (; n < 0 && result != DONE; ++n) { 450 result = previous(); 451 } 452 } else { 453 result = current(); 454 } 455 return result; 456 } 457 458 /** 459 * Advances the iterator to the next boundary position. 460 * @return The position of the first boundary after this one. 461 */ 462 @Override next()463 public int next() { 464 fBreakCache.next(); 465 return fDone ? DONE : fPosition; 466 } 467 468 /** 469 * Moves the iterator backwards, to the boundary preceding the current one. 470 * @return The position of the boundary position immediately preceding the starting position. 471 */ 472 @Override previous()473 public int previous() { 474 fBreakCache.previous(); 475 return fDone ? DONE : fPosition; 476 } 477 478 /** 479 * Sets the iterator to refer to the first boundary position following 480 * the specified position. 481 * @param startPos The position from which to begin searching for a break position. 482 * @return The position of the first break after the current position. 483 */ 484 @Override following(int startPos)485 public int following(int startPos) { 486 // if the supplied position is before the beginning, return the 487 // text's starting offset 488 if (startPos < fText.getBeginIndex()) { 489 return first(); 490 } 491 492 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 493 // Or it may be beyond the end of the text. 494 startPos = CISetIndex32(fText, startPos); 495 fBreakCache.following(startPos); 496 return fDone ? DONE : fPosition; 497 } 498 499 500 /** 501 * Sets the iterator to refer to the last boundary position before the 502 * specified position. 503 * @param offset The position to begin searching for a break from. 504 * @return The position of the last boundary before the starting position. 505 */ 506 @Override preceding(int offset)507 public int preceding(int offset) { 508 if (fText == null || offset > fText.getEndIndex()) { 509 return last(); 510 } else if (offset < fText.getBeginIndex()) { 511 return first(); 512 } 513 514 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 515 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 516 int adjustedOffset = offset; 517 fBreakCache.preceding(adjustedOffset); 518 return fDone ? DONE : fPosition; 519 520 } 521 522 523 /** 524 * Throw IllegalArgumentException unless begin <= offset < end. 525 */ checkOffset(int offset, CharacterIterator text)526 protected static final void checkOffset(int offset, CharacterIterator text) { 527 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 528 throw new IllegalArgumentException("offset out of bounds"); 529 } 530 } 531 532 533 /** 534 * Returns true if the specified position is a boundary position. As a side 535 * effect, leaves the iterator pointing to the first boundary position at 536 * or after "offset". 537 * @param offset the offset to check. 538 * @return True if "offset" is a boundary position. 539 */ 540 @Override isBoundary(int offset)541 public boolean isBoundary(int offset) { 542 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 543 // to not be boundaries, and to not be errors. 544 checkOffset(offset, fText); 545 546 // Adjust offset to be on a code point boundary and not beyond the end of the text. 547 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 548 // But we still need the side effect of leaving iteration at the following boundary. 549 int adjustedOffset = CISetIndex32(fText, offset); 550 551 boolean result = false; 552 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 553 result = (fBreakCache.current() == offset); 554 } 555 556 if (!result) { 557 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 558 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 559 next(); 560 } 561 return result; 562 563 } 564 565 /** 566 * Returns the current iteration position. Note that DONE is never 567 * returned from this function; if iteration has run to the end of a 568 * string, current() will return the length of the string while 569 * next() will return BreakIterator.DONE). 570 * @return The current iteration position. 571 */ 572 @Override current()573 public int current() { 574 return (fText != null) ? fPosition : BreakIterator.DONE; 575 } 576 577 578 /** 579 * Return the status tag from the break rule that determined the boundary at 580 * the current iteration position. The values appear in the rule source 581 * within brackets, {123}, for example. For rules that do not specify a 582 * status, a default value of 0 is returned. If more than one rule applies, 583 * the numerically largest of the possible status values is returned. 584 * <p> 585 * Of the standard types of ICU break iterators, only the word and line break 586 * iterator provides status values. The values are defined in 587 * class RuleBasedBreakIterator, and allow distinguishing between words 588 * that contain alphabetic letters, "words" that appear to be numbers, 589 * punctuation and spaces, words containing ideographic characters, and 590 * more. Call <code>getRuleStatus</code> after obtaining a boundary 591 * position from <code>next()</code>, <code>previous()</code>, or 592 * any other break iterator functions that returns a boundary position. 593 * <p> 594 * Note that <code>getRuleStatus()</code> returns the value corresponding to 595 * <code>current()</code> index even after <code>next()</code> has returned DONE. 596 * <p> 597 598 * @return the status from the break rule that determined the boundary 599 * at the current iteration position. 600 */ 601 602 @Override getRuleStatus()603 public int getRuleStatus() { 604 // Status records have this form: 605 // Count N <-- fLastRuleStatusIndex points here. 606 // Status val 0 607 // Status val 1 608 // ... 609 // Status val N-1 <-- the value we need to return 610 // The status values are sorted in ascending order. 611 // This function returns the last (largest) of the array of status values. 612 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 613 int tagVal = fRData.fStatusTable[idx]; 614 return tagVal; 615 } 616 617 /** 618 * Get the status (tag) values from the break rule(s) that determined the boundary 619 * at the current iteration position. The values appear in the rule source 620 * within brackets, {123}, for example. The default status value for rules 621 * that do not explicitly provide one is zero. 622 * <p> 623 * The status values used by the standard ICU break rules are defined 624 * as public constants in class RuleBasedBreakIterator. 625 * <p> 626 * If the size of the output array is insufficient to hold the data, 627 * the output will be truncated to the available length. No exception 628 * will be thrown. 629 * 630 * @param fillInArray an array to be filled in with the status values. 631 * @return The number of rule status values from the rules that determined 632 * the boundary at the current iteration position. 633 * In the event that the array is too small, the return value 634 * is the total number of status values that were available, 635 * not the reduced number that were actually returned. 636 */ 637 @Override getRuleStatusVec(int[] fillInArray)638 public int getRuleStatusVec(int[] fillInArray) { 639 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 640 if (fillInArray != null) { 641 int numToCopy = Math.min(numStatusVals, fillInArray.length); 642 for (int i=0; i<numToCopy; i++) { 643 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 644 } 645 } 646 return numStatusVals; 647 } 648 649 /** 650 * Returns a CharacterIterator over the text being analyzed. 651 * <p> 652 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 653 * must not be modified in any way while the BreakIterator is still in use. 654 * Doing so will lead to undefined behavior of the BreakIterator. 655 * Clone the returned CharacterIterator first and work with that. 656 * <p> 657 * The returned CharacterIterator is a reference 658 * to the <b>actual iterator being used</b> by the BreakIterator. 659 * No guarantees are made about the current position 660 * of this iterator when it is returned; it may differ from the 661 * BreakIterators current position. If you need to move that 662 * position to examine the text, clone this function's return value first. 663 * @return An iterator over the text being analyzed. 664 */ 665 @Override getText()666 public CharacterIterator getText() { 667 return fText; 668 } 669 670 /** 671 * Set the iterator to analyze a new piece of text. This function resets 672 * the current iteration position to the beginning of the text. 673 * (The old iterator is dropped.) 674 * <p> 675 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 676 * directly by the BreakIterator, and must not be altered in any 677 * way by code outside of the BreakIterator. 678 * Doing so will lead to undefined behavior of the BreakIterator. 679 * 680 * @param newText An iterator over the text to analyze. 681 */ 682 @Override setText(CharacterIterator newText)683 public void setText(CharacterIterator newText) { 684 if (newText != null) { 685 fBreakCache.reset(newText.getBeginIndex(), 0); 686 } else { 687 fBreakCache.reset(); 688 } 689 fDictionaryCache.reset(); 690 fText = newText; 691 this.first(); 692 } 693 694 /** 695 * Control debug, trace and dump options. 696 * @deprecated This API is ICU internal only. 697 * @hide draft / provisional / internal are hidden on Android 698 */ 699 @Deprecated 700 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 701 ICUDebug.value(RBBI_DEBUG_ARG) : null; 702 703 getLanguageBreakEngine(int c)704 private LanguageBreakEngine getLanguageBreakEngine(int c) { 705 706 // We have a dictionary character. 707 // Does an already instantiated break engine handle it? 708 for (LanguageBreakEngine candidate : fBreakEngines) { 709 if (candidate.handles(c)) { 710 return candidate; 711 } 712 } 713 714 synchronized (gAllBreakEngines) { 715 // This break iterator's list of break engines didn't handle the character. 716 // Check the global list, another break iterator may have instantiated the 717 // desired engine. 718 for (LanguageBreakEngine candidate : gAllBreakEngines) { 719 if (candidate.handles(c)) { 720 fBreakEngines.add(candidate); 721 return candidate; 722 } 723 } 724 725 // The global list doesn't have an existing engine, build one. 726 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 727 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 728 // Katakana, Hiragana and Han are handled by the same dictionary engine. 729 // Fold them together for mapping from script -> engine. 730 script = UScript.HAN; 731 } 732 733 LanguageBreakEngine eng; 734 try { 735 switch (script) { 736 case UScript.THAI: 737 try { 738 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 739 } catch (MissingResourceException e) { 740 eng = new ThaiBreakEngine(); 741 } 742 break; 743 case UScript.LAO: 744 eng = new LaoBreakEngine(); 745 break; 746 case UScript.MYANMAR: 747 try { 748 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 749 } catch (MissingResourceException e) { 750 eng = new BurmeseBreakEngine(); 751 } 752 break; 753 case UScript.KHMER: 754 eng = new KhmerBreakEngine(); 755 break; 756 case UScript.HAN: 757 eng = new CjkBreakEngine(false); 758 break; 759 case UScript.HANGUL: 760 eng = new CjkBreakEngine(true); 761 break; 762 default: 763 gUnhandledBreakEngine.handleChar(c); 764 eng = gUnhandledBreakEngine; 765 break; 766 } 767 } catch (IOException e) { 768 eng = null; 769 } 770 771 if (eng != null && eng != gUnhandledBreakEngine) { 772 gAllBreakEngines.add(eng); 773 fBreakEngines.add(eng); 774 } 775 return eng; 776 } // end synchronized(gAllBreakEngines) 777 } 778 779 /** 780 * The State Machine Engine for moving forward is here. 781 * This function is the heart of the RBBI run time engine. 782 * 783 * Input 784 * fPosition, the position in the text to begin from. 785 * Output 786 * fPosition: the boundary following the starting position. 787 * fDictionaryCharCount the number of dictionary characters encountered. 788 * If > 0, the segment will be further subdivided 789 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 790 * 791 * @return the new iterator position 792 * 793 * A note on supplementary characters and the position of underlying 794 * Java CharacterIterator: Normally, a character iterator is positioned at 795 * the char most recently returned by next(). Within this function, when 796 * a supplementary char is being processed, the char iterator is left 797 * sitting on the trail surrogate, in the middle of the code point. 798 * This is different from everywhere else, where an iterator always 799 * points at the lead surrogate of a supplementary. 800 */ handleNext()801 private int handleNext() { 802 if (TRACE) { 803 System.out.println("Handle Next pos char state category"); 804 } 805 806 // handleNext always sets the break tag value. 807 // Set the default for it. 808 fRuleStatusIndex = 0; 809 fDictionaryCharCount = 0; 810 811 // caches for quicker access 812 CharacterIterator text = fText; 813 CodePointTrie trie = fRData.fTrie; 814 815 char[] stateTable = fRData.fFTable.fTable; 816 int initialPosition = fPosition; 817 text.setIndex(initialPosition); 818 int result = initialPosition; 819 820 // Set up the starting char 821 int c = text.current(); 822 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 823 c = nextTrail32(text, c); 824 if (c == DONE32) { 825 fDone = true; 826 return BreakIterator.DONE; 827 } 828 } 829 830 // Set the initial state for the state machine 831 int state = START_STATE; 832 int row = fRData.getRowIndex(state); 833 short category = 3; 834 int flagsState = fRData.fFTable.fFlags; 835 int dictStart = fRData.fFTable.fDictCategoriesStart; 836 int mode = RBBI_RUN; 837 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 838 category = 2; 839 mode = RBBI_START; 840 if (TRACE) { 841 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 842 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 843 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 844 } 845 } 846 847 // loop until we reach the end of the text or transition to state 0 848 while (state != STOP_STATE) { 849 if (c == DONE32) { 850 // Reached end of input string. 851 if (mode == RBBI_END) { 852 // We have already run the loop one last time with the 853 // character set to the pseudo {eof} value. Now it is time 854 // to unconditionally bail out. 855 break; 856 } 857 // Run the loop one last time with the fake end-of-input character category 858 mode = RBBI_END; 859 category = 1; 860 } 861 else if (mode == RBBI_RUN) { 862 // Get the char category. An incoming category of 1 or 2 mens that 863 // we are preset for doing the beginning or end of input, and 864 // that we shouldn't get a category from an actual text input character. 865 // 866 867 // look up the current character's character category, which tells us 868 // which column in the state table to look at. 869 // 870 category = (short) trie.get(c); 871 872 // Check for categories that require word dictionary handling. 873 if (category >= dictStart) { 874 fDictionaryCharCount++; 875 } 876 877 if (TRACE) { 878 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 879 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 880 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 881 } 882 883 // Advance to the next character. 884 // If this is a beginning-of-input loop iteration, don't advance. 885 // The next iteration will be processing the first real input character. 886 c = text.next(); 887 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 888 c = nextTrail32(text, c); 889 } 890 } 891 else { 892 mode = RBBI_RUN; 893 } 894 895 // look up a state transition in the state table 896 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 897 row = fRData.getRowIndex(state); 898 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 899 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 900 // Match found, common case 901 result = text.getIndex(); 902 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 903 // The iterator has been left in the middle of a surrogate pair. 904 // We want the start of it. 905 result--; 906 } 907 908 // Remember the break status (tag) values. 909 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 910 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 911 // Lookahead match is completed 912 int lookaheadResult = fLookAheadMatches[accepting]; 913 if (lookaheadResult >= 0) { 914 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 915 fPosition = lookaheadResult; 916 return lookaheadResult; 917 } 918 } 919 920 921 // If we are at the position of the '/' in a look-ahead (hard break) rule; 922 // record the current position, to be returned later, if the full rule matches. 923 // TODO: Move this check before the previous check of fAccepting. 924 // This would enable hard-break rules with no following context. 925 // But there are line break test failures when trying this. Investigate. 926 // Issue ICU-20837 927 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 928 if (rule != 0) { 929 int pos = text.getIndex(); 930 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 931 // The iterator has been left in the middle of a surrogate pair. 932 // We want the beginning of it. 933 pos--; 934 } 935 fLookAheadMatches[rule] = pos; 936 } 937 938 939 } // End of state machine main loop 940 941 // The state machine is done. Check whether it found a match... 942 943 // If the iterator failed to advance in the match engine force it ahead by one. 944 // This indicates a defect in the break rules, which should always match 945 // at least one character. 946 947 if (result == initialPosition) { 948 if (TRACE) { 949 System.out.println("Iterator did not move. Advancing by 1."); 950 } 951 text.setIndex(initialPosition); 952 next32(text); 953 result = text.getIndex(); 954 fRuleStatusIndex = 0; 955 } 956 957 // Leave the iterator at our result position. 958 // (we may have advanced beyond the last accepting position chasing after 959 // longer matches that never completed.) 960 fPosition = result; 961 962 if (TRACE) { 963 System.out.println("result = " + result); 964 } 965 return result; 966 } 967 968 /** 969 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 970 * This locates a "Safe Position" from which the forward break rules 971 * will operate correctly. A Safe Position is not necessarily a boundary itself. 972 * 973 * The logic of this function is very similar to handleNext(), above, but simpler 974 * because the safe table does not require as many options. 975 * 976 * @param fromPosition the position in the input text to begin the iteration. 977 * @hide draft / provisional / internal are hidden on Android 978 */ handleSafePrevious(int fromPosition)979 private int handleSafePrevious(int fromPosition) { 980 char state; 981 short category = 0; 982 int result = 0; 983 984 // caches for quicker access 985 CharacterIterator text = fText; 986 CodePointTrie trie = fRData.fTrie; 987 char[] stateTable = fRData.fRTable.fTable; 988 989 CISetIndex32(text, fromPosition); 990 if (TRACE) { 991 System.out.print("Handle Previous pos char state category"); 992 } 993 994 // if we're already at the start of the text, return DONE. 995 if (text.getIndex() == text.getBeginIndex()) { 996 return BreakIterator.DONE; 997 } 998 999 // Set the initial state for the state machine 1000 int c = CharacterIteration.previous32(text); 1001 state = START_STATE; 1002 int row = fRData.getRowIndex(state); 1003 1004 // loop until we reach the start of the text or transition to state 0 1005 // 1006 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 1007 1008 // look up the current character's character category, which tells us 1009 // which column in the state table to look at. 1010 // 1011 // And off the dictionary flag bit. For reverse iteration it is not used. 1012 category = (short) trie.get(c); 1013 if (TRACE) { 1014 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1015 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1016 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1017 } 1018 1019 // State Transition - move machine to its next state 1020 // 1021 assert(category < fRData.fHeader.fCatCount); 1022 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1023 row = fRData.getRowIndex(state); 1024 1025 if (state == STOP_STATE) { 1026 // This is the normal exit from the lookup state machine. 1027 // Transition to state zero means we have found a safe point. 1028 break; 1029 } 1030 } 1031 1032 // The state machine is done. 1033 result = text.getIndex(); 1034 if (TRACE) { 1035 System.out.println("result = " + result); 1036 } 1037 return result; 1038 } 1039 1040 /** 1041 * Set the index of a CharacterIterator. 1042 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1043 * If the index points to a trail surrogate of a supplementary character, adjust it 1044 * to the start (lead surrogate) index. 1045 * 1046 * @param ci A CharacterIterator to set 1047 * @param index the index to set 1048 * @return the resulting index, possibly pinned or adjusted. 1049 */ 1050 private static int CISetIndex32(CharacterIterator ci, int index) { 1051 if (index <= ci.getBeginIndex()) { 1052 ci.first(); 1053 } else if (index >= ci.getEndIndex()) { 1054 ci.setIndex(ci.getEndIndex()); 1055 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1056 if (!Character.isHighSurrogate(ci.previous())) { 1057 ci.next(); 1058 } 1059 } 1060 return ci.getIndex(); 1061 } 1062 1063 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1064 * Dictionary boundaries are moved first to this cache, then from here 1065 * to the main BreakCache, where they may inter-leave with non-dictionary 1066 * boundaries. The public BreakIterator API always fetches directly 1067 * from the main BreakCache, not from here. 1068 * 1069 * In common situations, the number of boundaries in a single dictionary run 1070 * should be quite small, it will be terminated by punctuation, spaces, 1071 * or any other non-dictionary characters. The main BreakCache may end 1072 * up with boundaries from multiple dictionary based runs. 1073 * 1074 * The boundaries are stored in a simple ArrayList (vector), with the 1075 * assumption that they will be accessed sequentially. 1076 */ 1077 class DictionaryCache { 1078 1079 void reset() { 1080 fPositionInCache = -1; 1081 fStart = 0; 1082 fLimit = 0; 1083 fFirstRuleStatusIndex = 0; 1084 fOtherRuleStatusIndex = 0; 1085 fBreaks.removeAllElements(); 1086 }; 1087 1088 boolean following(int fromPos) { 1089 if (fromPos >= fLimit || fromPos < fStart) { 1090 fPositionInCache = -1; 1091 return false; 1092 } 1093 1094 // Sequential iteration, move from previous boundary to the following 1095 1096 int r = 0; 1097 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1098 ++fPositionInCache; 1099 if (fPositionInCache >= fBreaks.size()) { 1100 fPositionInCache = -1; 1101 return false; 1102 } 1103 r = fBreaks.elementAt(fPositionInCache); 1104 assert(r > fromPos); 1105 fBoundary = r; 1106 fStatusIndex = fOtherRuleStatusIndex; 1107 return true; 1108 } 1109 1110 // Random indexing. Linear search for the boundary following the given position. 1111 1112 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1113 r= fBreaks.elementAt(fPositionInCache); 1114 if (r > fromPos) { 1115 fBoundary = r; 1116 fStatusIndex = fOtherRuleStatusIndex; 1117 return true; 1118 } 1119 } 1120 1121 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1122 assert(false); 1123 fPositionInCache = -1; 1124 return false; 1125 }; 1126 preceding(int fromPos)1127 boolean preceding(int fromPos) { 1128 if (fromPos <= fStart || fromPos > fLimit) { 1129 fPositionInCache = -1; 1130 return false; 1131 } 1132 1133 if (fromPos == fLimit) { 1134 fPositionInCache = fBreaks.size() - 1; 1135 if (fPositionInCache >= 0) { 1136 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1137 } 1138 } 1139 1140 int r; 1141 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1142 --fPositionInCache; 1143 r = fBreaks.elementAt(fPositionInCache); 1144 assert(r < fromPos); 1145 fBoundary = r; 1146 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1147 return true; 1148 } 1149 1150 if (fPositionInCache == 0) { 1151 fPositionInCache = -1; 1152 return false; 1153 } 1154 1155 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1156 r = fBreaks.elementAt(fPositionInCache); 1157 if (r < fromPos) { 1158 fBoundary = r; 1159 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1160 return true; 1161 } 1162 } 1163 assert(false); 1164 fPositionInCache = -1; 1165 return false; 1166 }; 1167 1168 /** 1169 * Populate the cache with the dictionary based boundaries within a region of text. 1170 * @param startPos The start position of a range of text 1171 * @param endPos The end position of a range of text 1172 * @param firstRuleStatus The rule status index that applies to the break at startPos 1173 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1174 * @hide draft / provisional / internal are hidden on Android 1175 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1176 void populateDictionary(int startPos, int endPos, 1177 int firstRuleStatus, int otherRuleStatus) { 1178 if ((endPos - startPos) <= 1) { 1179 return; 1180 } 1181 1182 reset(); 1183 fFirstRuleStatusIndex = firstRuleStatus; 1184 fOtherRuleStatusIndex = otherRuleStatus; 1185 1186 int rangeStart = startPos; 1187 int rangeEnd = endPos; 1188 1189 int category; 1190 int current; 1191 int foundBreakCount = 0; 1192 1193 // Loop through the text, looking for ranges of dictionary characters. 1194 // For each span, find the appropriate break engine, and ask it to find 1195 // any breaks within the span. 1196 1197 fText.setIndex(rangeStart); 1198 int c = CharacterIteration.current32(fText); 1199 category = (short)fRData.fTrie.get(c); 1200 int dictStart = fRData.fFTable.fDictCategoriesStart; 1201 1202 while(true) { 1203 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1204 c = CharacterIteration.next32(fText); // pre-increment 1205 category = (short)fRData.fTrie.get(c); 1206 } 1207 if (current >= rangeEnd) { 1208 break; 1209 } 1210 1211 // We now have a dictionary character. Get the appropriate language object 1212 // to deal with it. 1213 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1214 1215 // Ask the language object if there are any breaks. It will add them to the cache and 1216 // leave the text pointer on the other side of its range, ready to search for the next one. 1217 if (lbe != null) { 1218 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); 1219 } 1220 1221 // Reload the loop variables for the next go-round 1222 c = CharacterIteration.current32(fText); 1223 category = (short)fRData.fTrie.get(c); 1224 } 1225 1226 // If we found breaks, ensure that the first and last entries are 1227 // the original starting and ending position. And initialize the 1228 // cache iteration position to the first entry. 1229 1230 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1231 if (foundBreakCount > 0) { 1232 assert(foundBreakCount == fBreaks.size()); 1233 if (startPos < fBreaks.elementAt(0)) { 1234 // The dictionary did not place a boundary at the start of the segment of text. 1235 // Add one now. This should not commonly happen, but it would be easy for interactions 1236 // of the rules for dictionary segments and the break engine implementations to 1237 // inadvertently cause it. Cover it here, just in case. 1238 fBreaks.offer(startPos); 1239 } 1240 if (endPos > fBreaks.peek()) { 1241 fBreaks.push(endPos); 1242 } 1243 fPositionInCache = 0; 1244 // Note: Dictionary matching may extend beyond the original limit. 1245 fStart = fBreaks.elementAt(0); 1246 fLimit = fBreaks.peek(); 1247 } else { 1248 // there were no language-based breaks, even though the segment contained 1249 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1250 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1251 } 1252 1253 }; 1254 1255 DictionaryCache()1256 DictionaryCache() { 1257 fPositionInCache = -1; 1258 fBreaks = new DictionaryBreakEngine.DequeI(); 1259 } 1260 1261 /** 1262 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1263 * 1264 * @param src the source object to be copied. 1265 */ DictionaryCache(DictionaryCache src)1266 DictionaryCache(DictionaryCache src) { 1267 try { 1268 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1269 } 1270 catch (CloneNotSupportedException e) { 1271 throw new RuntimeException(e); 1272 } 1273 fPositionInCache = src.fPositionInCache; 1274 fStart = src.fStart; 1275 fLimit = src.fLimit; 1276 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1277 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1278 fBoundary = src.fBoundary; 1279 fStatusIndex = src.fStatusIndex; 1280 } 1281 1282 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1283 DictionaryBreakEngine.DequeI fBreaks; 1284 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1285 // // or preceding(). Optimizes sequential access. 1286 int fStart; // Text position of first boundary in cache. 1287 int fLimit; // Last boundary in cache. Which is the limit of the 1288 // // text segment being handled by the dictionary. 1289 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1290 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1291 int fBoundary; // Current boundary. Set by preceding(), following(). 1292 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1293 }; 1294 1295 1296 1297 1298 /* 1299 * class BreakCache 1300 * 1301 * Cache of break boundary positions and rule status values. 1302 * Break iterator API functions, next(), previous(), etc., will use cached results 1303 * when possible, and otherwise cache new results as they are obtained. 1304 * 1305 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1306 * 1307 * The cache is implemented as a single circular buffer. 1308 */ 1309 1310 /* 1311 * size of the circular cache buffer. 1312 */ 1313 1314 class BreakCache { 1315 BreakCache()1316 BreakCache() { 1317 reset(); 1318 }; 1319 reset(int pos, int ruleStatus)1320 void reset(int pos, int ruleStatus) { 1321 fStartBufIdx = 0; 1322 fEndBufIdx = 0; 1323 fTextIdx = pos; 1324 fBufIdx = 0; 1325 fBoundaries[0] = pos; 1326 fStatuses[0] = (short)ruleStatus; 1327 } 1328 reset()1329 void reset() {reset(0, 0); }; 1330 next()1331 void next() { 1332 if (fBufIdx == fEndBufIdx) { 1333 fDone = !populateFollowing(); 1334 fPosition = fTextIdx; 1335 fRuleStatusIndex = fStatuses[fBufIdx]; 1336 } else { 1337 fBufIdx = modChunkSize(fBufIdx + 1); 1338 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1339 fRuleStatusIndex = fStatuses[fBufIdx]; 1340 } 1341 }; 1342 previous()1343 void previous() { 1344 int initialBufIdx = fBufIdx; 1345 if (fBufIdx == fStartBufIdx) { 1346 // At start of cache. Prepend to it. 1347 populatePreceding(); 1348 } else { 1349 // Cache already holds the next boundary 1350 fBufIdx = modChunkSize(fBufIdx - 1); 1351 fTextIdx = fBoundaries[fBufIdx]; 1352 } 1353 fDone = (fBufIdx == initialBufIdx); 1354 fPosition = fTextIdx; 1355 fRuleStatusIndex = fStatuses[fBufIdx]; 1356 return; 1357 }; 1358 1359 // Move the iteration state to the position following the startPosition. 1360 // Input position must be pinned to the input length. following(int startPos)1361 void following(int startPos) { 1362 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1363 // startPos is in the cache. Do a next() from that position. 1364 // TODO: an awkward set of interactions with bi->fDone 1365 // seek() does not clear it; it can't because of interactions with populateNear(). 1366 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1367 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1368 fDone = false; 1369 next(); 1370 } 1371 1372 }; 1373 preceding(int startPos)1374 void preceding(int startPos) { 1375 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1376 if (startPos == fTextIdx) { 1377 previous(); 1378 } else { 1379 // seek() leaves the BreakCache positioned at the preceding boundary 1380 // if the requested position is between two boundaries. 1381 // current() pushes the BreakCache position out to the BreakIterator itself. 1382 assert(startPos > fTextIdx); 1383 current(); 1384 } 1385 } 1386 return; 1387 }; 1388 1389 /** 1390 * Update the state of the public BreakIterator (fBI) to reflect the 1391 * current state of the break iterator cache (this). 1392 */ current()1393 int current() { 1394 fPosition = fTextIdx; 1395 fRuleStatusIndex = fStatuses[fBufIdx]; 1396 fDone = false; 1397 return fTextIdx; 1398 }; 1399 1400 /** 1401 * Add boundaries to the cache near the specified position. 1402 * The given position need not be a boundary itself. 1403 * The input position must be within the range of the text, and 1404 * on a code point boundary. 1405 * If the requested position is a break boundary, leave the iteration 1406 * position on it. 1407 * If the requested position is not a boundary, leave the iteration 1408 * position on the preceding boundary and include both the the 1409 * preceding and following boundaries in the cache. 1410 * Additional boundaries, either preceding or following, may be added 1411 * to the cache as a side effect. 1412 * 1413 * Return false if the operation failed. 1414 */ populateNear(int position)1415 boolean populateNear(int position) { 1416 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1417 1418 // Find a boundary somewhere in the vicinity of the requested position. 1419 // Depending on the safe rules and the text data, it could be either before, at, or after 1420 // the requested position. 1421 1422 1423 // If the requested position is not near already cached positions, clear the existing cache, 1424 // find a near-by boundary and begin new cache contents there. 1425 1426 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1427 int aBoundary = fText.getBeginIndex(); 1428 int ruleStatusIndex = 0; 1429 if (position > aBoundary + 20) { 1430 int backupPos = handleSafePrevious(position); 1431 if (backupPos > aBoundary) { 1432 // Advance to the boundary following the backup position. 1433 // There is a complication: the safe reverse rules identify pairs of code points 1434 // that are safe. If advancing from the safe point moves forwards by less than 1435 // two code points, we need to advance one more time to ensure that the boundary 1436 // is good, including a correct rules status value. 1437 // 1438 fPosition = backupPos; 1439 aBoundary = handleNext(); 1440 if (aBoundary == backupPos + 1 || 1441 (aBoundary == backupPos + 2 && 1442 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1443 Character.isLowSurrogate(fText.next()))) { 1444 // The initial handleNext() only advanced by a single code point. Go again. 1445 // Safe rules identify safe pairs. 1446 aBoundary = handleNext(); 1447 } 1448 } 1449 ruleStatusIndex = fRuleStatusIndex; 1450 } 1451 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1452 } 1453 1454 // Fill in boundaries between existing cache content and the new requested position. 1455 1456 if (fBoundaries[fEndBufIdx] < position) { 1457 // The last position in the cache precedes the requested position. 1458 // Add following position(s) to the cache. 1459 while (fBoundaries[fEndBufIdx] < position) { 1460 if (!populateFollowing()) { 1461 assert false; 1462 return false; 1463 } 1464 } 1465 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1466 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1467 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1468 previous(); 1469 } 1470 return true; 1471 } 1472 1473 if (fBoundaries[fStartBufIdx] > position) { 1474 // The first position in the cache is beyond the requested position. 1475 // back up more until we get a boundary <= the requested position. 1476 while (fBoundaries[fStartBufIdx] > position) { 1477 populatePreceding(); 1478 } 1479 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1480 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1481 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1482 next(); 1483 } 1484 if (fTextIdx > position) { 1485 // If position is not itself a boundary, the next() loop above will overshoot. 1486 // Back up one, leaving cache position at the boundary preceding the requested position. 1487 previous(); 1488 } 1489 return true; 1490 } 1491 1492 assert fTextIdx == position; 1493 return true; 1494 1495 }; 1496 1497 /** 1498 * Add boundary(s) to the cache following the current last boundary. 1499 * Return false if at the end of the text, and no more boundaries can be added. 1500 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1501 */ populateFollowing()1502 boolean populateFollowing() { 1503 int fromPosition = fBoundaries[fEndBufIdx]; 1504 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1505 int pos = 0; 1506 int ruleStatusIdx = 0; 1507 1508 if (fDictionaryCache.following(fromPosition)) { 1509 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1510 return true; 1511 } 1512 1513 fPosition = fromPosition; 1514 pos = handleNext(); 1515 if (pos == BreakIterator.DONE) { 1516 return false; 1517 } 1518 1519 ruleStatusIdx = fRuleStatusIndex; 1520 if (fDictionaryCharCount > 0) { 1521 // The text segment obtained from the rules includes dictionary characters. 1522 // Subdivide it, with subdivided results going into the dictionary cache. 1523 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1524 if (fDictionaryCache.following(fromPosition)) { 1525 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1526 return true; 1527 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1528 // But be careful with interactions with populateNear(). 1529 } 1530 } 1531 1532 // Rule based segment did not include dictionary characters. 1533 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1534 // meaning that we didn't take the return, above. 1535 // Add its end point to the cache. 1536 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1537 1538 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1539 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1540 // 1541 for (int count=0; count<6; ++count) { 1542 pos = handleNext(); 1543 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1544 break; 1545 } 1546 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1547 } 1548 return true; 1549 }; 1550 1551 /** 1552 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1553 * Leave the iteration position on the first added boundary. 1554 * Return false if no boundaries could be added (if at the start of the text.) 1555 */ populatePreceding()1556 boolean populatePreceding() { 1557 int textBegin = fText.getBeginIndex(); 1558 int fromPosition = fBoundaries[fStartBufIdx]; 1559 if (fromPosition == textBegin) { 1560 return false; 1561 } 1562 1563 int position = textBegin; 1564 int positionStatusIdx = 0; 1565 1566 if (fDictionaryCache.preceding(fromPosition)) { 1567 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1568 return true; 1569 } 1570 1571 int backupPosition = fromPosition; 1572 1573 // Find a boundary somewhere preceding the first already-cached boundary 1574 do { 1575 backupPosition = backupPosition - 30; 1576 if (backupPosition <= textBegin) { 1577 backupPosition = textBegin; 1578 } else { 1579 backupPosition = handleSafePrevious(backupPosition); 1580 } 1581 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1582 position = textBegin; 1583 positionStatusIdx = 0; 1584 } else { 1585 // Advance to the boundary following the backup position. 1586 // There is a complication: the safe reverse rules identify pairs of code points 1587 // that are safe. If advancing from the safe point moves forwards by less than 1588 // two code points, we need to advance one more time to ensure that the boundary 1589 // is good, including a correct rules status value. 1590 // 1591 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1592 position = handleNext(); 1593 if (position == backupPosition + 1 || 1594 (position == backupPosition + 2 && 1595 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1596 Character.isLowSurrogate(fText.next()))) { 1597 // The initial handleNext() only advanced by a single code point. Go again. 1598 // Safe rules identify safe pairs. 1599 position = handleNext(); 1600 } 1601 positionStatusIdx = fRuleStatusIndex; 1602 } 1603 } while (position >= fromPosition); 1604 1605 // Find boundaries between the one we just located and the first already-cached boundary 1606 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1607 1608 fSideBuffer.removeAllElements(); 1609 fSideBuffer.push(position); 1610 fSideBuffer.push(positionStatusIdx); 1611 1612 do { 1613 int prevPosition = fPosition = position; 1614 int prevStatusIdx = positionStatusIdx; 1615 position = handleNext(); 1616 positionStatusIdx = fRuleStatusIndex; 1617 if (position == BreakIterator.DONE) { 1618 break; 1619 } 1620 1621 boolean segmentHandledByDictionary = false; 1622 if (fDictionaryCharCount != 0) { 1623 // Segment from the rules includes dictionary characters. 1624 // Subdivide it, with subdivided results going into the dictionary cache. 1625 int dictSegEndPosition = position; 1626 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1627 while (fDictionaryCache.following(prevPosition)) { 1628 position = fDictionaryCache.fBoundary; 1629 positionStatusIdx = fDictionaryCache.fStatusIndex; 1630 segmentHandledByDictionary = true; 1631 assert(position > prevPosition); 1632 if (position >= fromPosition) { 1633 break; 1634 } 1635 assert(position <= dictSegEndPosition); 1636 fSideBuffer.push(position); 1637 fSideBuffer.push(positionStatusIdx); 1638 prevPosition = position; 1639 } 1640 assert(position==dictSegEndPosition || position>=fromPosition); 1641 } 1642 1643 if (!segmentHandledByDictionary && position < fromPosition) { 1644 fSideBuffer.push(position); 1645 fSideBuffer.push(positionStatusIdx); 1646 } 1647 } while (position < fromPosition); 1648 1649 // Move boundaries from the side buffer to the main circular buffer. 1650 boolean success = false; 1651 if (!fSideBuffer.isEmpty()) { 1652 positionStatusIdx = fSideBuffer.pop(); 1653 position = fSideBuffer.pop(); 1654 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1655 success = true; 1656 } 1657 1658 while (!fSideBuffer.isEmpty()) { 1659 positionStatusIdx = fSideBuffer.pop(); 1660 position = fSideBuffer.pop(); 1661 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1662 // No space in circular buffer to hold a new preceding result while 1663 // also retaining the current cache (iteration) position. 1664 // Bailing out is safe; the cache will refill again if needed. 1665 break; 1666 } 1667 } 1668 return success; 1669 }; 1670 1671 1672 static final boolean RetainCachePosition = false; 1673 static final boolean UpdateCachePosition = true; 1674 1675 /** 1676 * Add the boundary following the current position. 1677 * The current position can be left as it was, or changed to the newly added boundary, 1678 * as specified by the update parameter. 1679 */ addFollowing(int position, int ruleStatusIdx, boolean update)1680 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1681 assert(position > fBoundaries[fEndBufIdx]); 1682 assert(ruleStatusIdx <= Short.MAX_VALUE); 1683 int nextIdx = modChunkSize(fEndBufIdx + 1); 1684 if (nextIdx == fStartBufIdx) { 1685 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1686 } 1687 fBoundaries[nextIdx] = position; 1688 fStatuses[nextIdx] = (short)ruleStatusIdx; 1689 fEndBufIdx = nextIdx; 1690 if (update == UpdateCachePosition) { 1691 // Set current position to the newly added boundary. 1692 fBufIdx = nextIdx; 1693 fTextIdx = position; 1694 } else { 1695 // Retaining the original cache position. 1696 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1697 // It's the responsibility of callers of this function to not add too many. 1698 assert(nextIdx != fBufIdx); 1699 } 1700 1701 }; 1702 1703 1704 /** 1705 * Add the boundary preceding the current position. 1706 * The current position can be left as it was, or changed to the newly added boundary, 1707 * as specified by the update parameter. 1708 */ addPreceding(int position, int ruleStatusIdx, boolean update)1709 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1710 assert(position < fBoundaries[fStartBufIdx]); 1711 assert(ruleStatusIdx <= Short.MAX_VALUE); 1712 int nextIdx = modChunkSize(fStartBufIdx - 1); 1713 if (nextIdx == fEndBufIdx) { 1714 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1715 // Failure. The insertion of the new boundary would claim the buffer position that is the 1716 // current iteration position. And we also want to retain the current iteration position. 1717 // (The buffer is already completely full of entries that precede the iteration position.) 1718 return false; 1719 } 1720 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1721 } 1722 fBoundaries[nextIdx] = position; 1723 fStatuses[nextIdx] = (short)ruleStatusIdx; 1724 fStartBufIdx = nextIdx; 1725 if (update == UpdateCachePosition) { 1726 fBufIdx = nextIdx; 1727 fTextIdx = position; 1728 } 1729 return true; 1730 }; 1731 1732 /** 1733 * Set the cache position to the specified position, or, if the position 1734 * falls between to cached boundaries, to the preceding boundary. 1735 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1736 * The startPosition must be on a code point boundary. 1737 * 1738 * Return true if successful, false if the specified position is after 1739 * the last cached boundary or before the first. 1740 */ 1741 boolean seek(int pos) { 1742 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1743 return false; 1744 } 1745 if (pos == fBoundaries[fStartBufIdx]) { 1746 // Common case: seek(0), from BreakIterator::first() 1747 fBufIdx = fStartBufIdx; 1748 fTextIdx = fBoundaries[fBufIdx]; 1749 return true; 1750 } 1751 if (pos == fBoundaries[fEndBufIdx]) { 1752 fBufIdx = fEndBufIdx; 1753 fTextIdx = fBoundaries[fBufIdx]; 1754 return true; 1755 } 1756 1757 int min = fStartBufIdx; 1758 int max = fEndBufIdx; 1759 while (min != max) { 1760 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1761 probe = modChunkSize(probe); 1762 if (fBoundaries[probe] > pos) { 1763 max = probe; 1764 } else { 1765 min = modChunkSize(probe + 1); 1766 } 1767 } 1768 assert(fBoundaries[max] > pos); 1769 fBufIdx = modChunkSize(max - 1); 1770 fTextIdx = fBoundaries[fBufIdx]; 1771 assert(fTextIdx <= pos); 1772 return true; 1773 1774 }; 1775 1776 1777 /** 1778 * copy constructor, used from RuleBasedBreakIterator.clone(). 1779 * 1780 * @param src 1781 */ BreakCache(BreakCache src)1782 BreakCache(BreakCache src) { 1783 fStartBufIdx = src.fStartBufIdx; 1784 fEndBufIdx = src.fEndBufIdx; 1785 fTextIdx = src.fTextIdx; 1786 fBufIdx = src.fBufIdx; 1787 fBoundaries = src.fBoundaries.clone(); 1788 fStatuses = src.fStatuses.clone(); 1789 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1790 } 1791 dumpCache()1792 void dumpCache() { 1793 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1794 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1795 System.out.printf("%d %d%n", i, fBoundaries[i]); 1796 if (i == fEndBufIdx) { 1797 break; 1798 } 1799 } 1800 }; 1801 modChunkSize(int index)1802 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1803 1804 static final int CACHE_SIZE = 128; 1805 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1806 1807 int fStartBufIdx; 1808 int fEndBufIdx; // inclusive 1809 1810 int fTextIdx; 1811 int fBufIdx; 1812 1813 int[] fBoundaries = new int[CACHE_SIZE]; 1814 short[] fStatuses = new short[CACHE_SIZE]; 1815 1816 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1817 }; 1818 1819 1820 1821 1822 } 1823 1824