1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2005-2016 International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import static android.icu.impl.CharacterIteration.DONE32; 14 import static android.icu.impl.CharacterIteration.next32; 15 import static android.icu.impl.CharacterIteration.nextTrail32; 16 17 import java.io.ByteArrayOutputStream; 18 import java.io.IOException; 19 import java.io.InputStream; 20 import java.io.OutputStream; 21 import java.nio.ByteBuffer; 22 import java.text.CharacterIterator; 23 import java.util.ArrayList; 24 import java.util.List; 25 26 import android.icu.impl.CharacterIteration; 27 import android.icu.impl.ICUBinary; 28 import android.icu.impl.ICUDebug; 29 import android.icu.impl.RBBIDataWrapper; 30 import android.icu.lang.UCharacter; 31 import android.icu.lang.UProperty; 32 import android.icu.lang.UScript; 33 import android.icu.util.CodePointTrie; 34 35 /** 36 * Rule Based Break Iterator 37 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 38 * 39 * @hide Only a subset of ICU is exposed in Android 40 */ 41 public class RuleBasedBreakIterator extends BreakIterator { 42 //======================================================================= 43 // Constructors & Factories 44 //======================================================================= 45 46 /** 47 * private constructor 48 */ RuleBasedBreakIterator()49 private RuleBasedBreakIterator() { 50 fDictionaryCharCount = 0; 51 synchronized(gAllBreakEngines) { 52 fBreakEngines = new ArrayList<>(gAllBreakEngines); 53 } 54 } 55 56 /** 57 * Create a break iterator from a precompiled set of break rules. 58 * 59 * Creating a break iterator from the binary rules is much faster than 60 * creating one from source rules. 61 * 62 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 63 * Binary break iterator rules are not guaranteed to be compatible between 64 * different versions of ICU. 65 * 66 * @param is an input stream supplying the compiled binary rules. 67 * @throws IOException if there is an error while reading the rules from the InputStream. 68 * @see #compileRules(String, OutputStream) 69 */ getInstanceFromCompiledRules(InputStream is)70 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 71 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 72 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 73 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 74 return This; 75 } 76 77 /** 78 * Create a break iterator from a precompiled set of break rules. 79 * 80 * Creating a break iterator from the binary rules is much faster than 81 * creating one from source rules. 82 * 83 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 84 * Binary break iterator rules are not guaranteed to be compatible between 85 * different versions of ICU. 86 * 87 * @param bytes a buffer supplying the compiled binary rules. 88 * @throws IOException if there is an error while reading the rules from the buffer. 89 * @see #compileRules(String, OutputStream) 90 * @deprecated This API is ICU internal only. 91 * @hide draft / provisional / internal are hidden on Android 92 */ 93 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)94 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 95 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 96 This.fRData = RBBIDataWrapper.get(bytes); 97 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 98 return This; 99 } 100 101 /** 102 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 103 * @param rules The break rules to be used. 104 */ RuleBasedBreakIterator(String rules)105 public RuleBasedBreakIterator(String rules) { 106 this(); 107 try { 108 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 109 compileRules(rules, ruleOS); 110 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 111 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 112 } catch (IOException e) { 113 ///CLOVER:OFF 114 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 115 // causing bogus compiled rules to be produced, but with no compile error raised. 116 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 117 + e.getMessage()); 118 throw rte; 119 ///CLOVER:ON 120 } 121 } 122 123 //======================================================================= 124 // Boilerplate 125 //======================================================================= 126 127 /** 128 * Clones this iterator. 129 * @return A newly-constructed RuleBasedBreakIterator with the same 130 * behavior as this one. 131 */ 132 @Override clone()133 public Object clone() { 134 RuleBasedBreakIterator result; 135 result = (RuleBasedBreakIterator)super.clone(); 136 if (fText != null) { 137 result.fText = (CharacterIterator)(fText.clone()); 138 } 139 synchronized (gAllBreakEngines) { 140 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 141 } 142 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 143 result.fBreakCache = result.new BreakCache(fBreakCache); 144 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 145 return result; 146 } 147 148 149 /** 150 * Returns true if both BreakIterators are of the same class, have the same 151 * rules, and iterate over the same text. 152 */ 153 @Override equals(Object that)154 public boolean equals(Object that) { 155 if (that == null) { 156 return false; 157 } 158 if (this == that) { 159 return true; 160 } 161 try { 162 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 163 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 164 return false; 165 } 166 if (fRData != null && other.fRData != null && 167 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 168 return false; 169 } 170 if (fText == null && other.fText == null) { 171 return true; 172 } 173 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 174 return false; 175 } 176 return fPosition == other.fPosition; 177 } 178 catch(ClassCastException e) { 179 return false; 180 } 181 } 182 183 /** 184 * Returns the description (rules) used to create this iterator. 185 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 186 */ 187 @Override toString()188 public String toString() { 189 String retStr = ""; 190 if (fRData != null) { 191 retStr = fRData.fRuleSource; 192 } 193 return retStr; 194 } 195 196 /** 197 * Compute a hashcode for this BreakIterator 198 * @return A hash code 199 */ 200 @Override hashCode()201 public int hashCode() 202 { 203 return fRData.fRuleSource.hashCode(); 204 } 205 206 207 private static final int START_STATE = 1; // The state number of the starting state 208 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 209 210 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 211 // of user text. A variable with this enum type keeps track of where we 212 // are. The state machine only fetches user text input while in RUN mode. 213 private static final int RBBI_START = 0; 214 private static final int RBBI_RUN = 1; 215 private static final int RBBI_END = 2; 216 217 /** 218 * The character iterator through which this BreakIterator accesses the text. 219 */ 220 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 221 222 /** 223 * The rule data for this BreakIterator instance. 224 * Not intended for public use. Declared public for testing purposes only. 225 * @deprecated This API is ICU internal only. 226 * @hide draft / provisional / internal are hidden on Android 227 */ 228 @Deprecated 229 public RBBIDataWrapper fRData; 230 231 /** 232 * The iteration state - current position, rule status for the current position, 233 * and whether the iterator ran off the end, yielding UBRK_DONE. 234 * Current position is pinned to be 0 < position <= text.length. 235 * Current position is always set to a boundary. 236 * 237 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 238 * Never has the value UBRK_DONE (-1). 239 */ 240 private int fPosition; 241 242 /** 243 * Index of the Rule {tag} values for the most recent match. 244 */ 245 private int fRuleStatusIndex; 246 247 /** 248 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 249 */ 250 private boolean fDone; 251 252 /** 253 * Array of look-ahead tentative results. 254 */ 255 private int[] fLookAheadMatches; 256 257 /** 258 * Cache of previously determined boundary positions. 259 */ 260 private BreakCache fBreakCache = new BreakCache(); 261 262 263 /** 264 * Counter for the number of characters encountered with the "dictionary" 265 * flag set. Normal RBBI iterators don't use it, although the code 266 * for updating it is live. Dictionary Based break iterators (a subclass 267 * of us) access this field directly. 268 * @hide draft / provisional / internal are hidden on Android 269 */ 270 private int fDictionaryCharCount; 271 272 private DictionaryCache fDictionaryCache = new DictionaryCache(); 273 274 /** 275 * ICU debug argument name for RBBI 276 */ 277 private static final String RBBI_DEBUG_ARG = "rbbi"; 278 279 /** 280 * Debugging flag. Trace operation of state machine when true. 281 */ 282 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 283 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 284 285 /** 286 * The "default" break engine - just skips over ranges of dictionary words, 287 * producing no breaks. Should only be used if characters need to be handled 288 * by a dictionary but we have no dictionary implementation for them. 289 * 290 * Only one instance; shared by all break iterators. 291 */ 292 private static final UnhandledBreakEngine gUnhandledBreakEngine; 293 294 /** 295 * List of all known break engines, common for all break iterators. 296 * Lazily updated as break engines are needed, because instantiation of 297 * break engines is expensive. 298 * 299 * Because gAllBreakEngines can be referenced concurrently from different 300 * BreakIterator instances, all access is synchronized. 301 */ 302 private static final List<LanguageBreakEngine> gAllBreakEngines; 303 304 static { 305 gUnhandledBreakEngine = new UnhandledBreakEngine(); 306 gAllBreakEngines = new ArrayList<>(); 307 gAllBreakEngines.add(gUnhandledBreakEngine); 308 } 309 310 /** 311 * List of all known break engines. Similar to gAllBreakEngines, but local to a 312 * break iterator, allowing it to be used without synchronization. 313 */ 314 private List<LanguageBreakEngine> fBreakEngines; 315 316 /** 317 * Dump the contents of the state table and character classes for this break iterator. 318 * For debugging only. 319 * @deprecated This API is ICU internal only. 320 * @hide draft / provisional / internal are hidden on Android 321 */ 322 @Deprecated dump(java.io.PrintStream out)323 public void dump(java.io.PrintStream out) { 324 if (out == null) { 325 out = System.out; 326 } 327 this.fRData.dump(out); 328 } 329 330 /** 331 * Compile a set of source break rules into the binary state tables used 332 * by the break iterator engine. Creating a break iterator from precompiled 333 * rules is much faster than creating one from source rules. 334 * 335 * Binary break rules are not guaranteed to be compatible between different 336 * versions of ICU. 337 * 338 * 339 * @param rules The source form of the break rules 340 * @param ruleBinary An output stream to receive the compiled rules. 341 * @throws IOException If there is an error writing the output. 342 * @see #getInstanceFromCompiledRules(InputStream) 343 */ compileRules(String rules, OutputStream ruleBinary)344 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 345 RBBIRuleBuilder.compileRules(rules, ruleBinary); 346 } 347 348 //======================================================================= 349 // BreakIterator overrides 350 //======================================================================= 351 352 /** 353 * Sets the current iteration position to the beginning of the text. 354 * (i.e., the CharacterIterator's starting offset). 355 * @return The offset of the beginning of the text. 356 */ 357 @Override first()358 public int first() { 359 if (fText == null) { 360 return BreakIterator.DONE; 361 } 362 fText.first(); 363 int start = fText.getIndex(); 364 if (!fBreakCache.seek(start)) { 365 fBreakCache.populateNear(start); 366 } 367 fBreakCache.current(); 368 assert(fPosition == start); 369 return fPosition; 370 } 371 372 /** 373 * Sets the current iteration position to the end of the text. 374 * (i.e., the CharacterIterator's ending offset). 375 * @return The text's past-the-end offset. 376 */ 377 @Override last()378 public int last() { 379 if (fText == null) { 380 return BreakIterator.DONE; 381 } 382 int endPos = fText.getEndIndex(); 383 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 384 assert(endShouldBeBoundary); 385 if (fPosition != endPos) { 386 assert(fPosition == endPos); 387 } 388 return endPos; 389 } 390 391 /** 392 * Advances the iterator either forward or backward the specified number of steps. 393 * Negative values move backward, and positive values move forward. This is 394 * equivalent to repeatedly calling next() or previous(). 395 * @param n The number of steps to move. The sign indicates the direction 396 * (negative is backwards, and positive is forwards). 397 * @return The character offset of the boundary position n boundaries away from 398 * the current one. 399 */ 400 @Override next(int n)401 public int next(int n) { 402 int result = 0; 403 if (n > 0) { 404 for (; n > 0 && result != DONE; --n) { 405 result = next(); 406 } 407 } else if (n < 0) { 408 for (; n < 0 && result != DONE; ++n) { 409 result = previous(); 410 } 411 } else { 412 result = current(); 413 } 414 return result; 415 } 416 417 /** 418 * Advances the iterator to the next boundary position. 419 * @return The position of the first boundary after this one. 420 */ 421 @Override next()422 public int next() { 423 fBreakCache.next(); 424 return fDone ? DONE : fPosition; 425 } 426 427 /** 428 * Moves the iterator backwards, to the boundary preceding the current one. 429 * @return The position of the boundary position immediately preceding the starting position. 430 */ 431 @Override previous()432 public int previous() { 433 fBreakCache.previous(); 434 return fDone ? DONE : fPosition; 435 } 436 437 /** 438 * Sets the iterator to refer to the first boundary position following 439 * the specified position. 440 * @param startPos The position from which to begin searching for a break position. 441 * @return The position of the first break after the current position. 442 */ 443 @Override following(int startPos)444 public int following(int startPos) { 445 // if the supplied position is before the beginning, return the 446 // text's starting offset 447 if (startPos < fText.getBeginIndex()) { 448 return first(); 449 } 450 451 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 452 // Or it may be beyond the end of the text. 453 startPos = CISetIndex32(fText, startPos); 454 fBreakCache.following(startPos); 455 return fDone ? DONE : fPosition; 456 } 457 458 459 /** 460 * Sets the iterator to refer to the last boundary position before the 461 * specified position. 462 * @param offset The position to begin searching for a break from. 463 * @return The position of the last boundary before the starting position. 464 */ 465 @Override preceding(int offset)466 public int preceding(int offset) { 467 if (fText == null || offset > fText.getEndIndex()) { 468 return last(); 469 } else if (offset < fText.getBeginIndex()) { 470 return first(); 471 } 472 473 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 474 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 475 int adjustedOffset = offset; 476 fBreakCache.preceding(adjustedOffset); 477 return fDone ? DONE : fPosition; 478 479 } 480 481 482 /** 483 * Throw IllegalArgumentException unless begin <= offset < end. 484 */ checkOffset(int offset, CharacterIterator text)485 protected static final void checkOffset(int offset, CharacterIterator text) { 486 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 487 throw new IllegalArgumentException("offset out of bounds"); 488 } 489 } 490 491 492 /** 493 * Returns true if the specified position is a boundary position. As a side 494 * effect, leaves the iterator pointing to the first boundary position at 495 * or after "offset". 496 * @param offset the offset to check. 497 * @return True if "offset" is a boundary position. 498 */ 499 @Override isBoundary(int offset)500 public boolean isBoundary(int offset) { 501 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 502 // to not be boundaries, and to not be errors. 503 checkOffset(offset, fText); 504 505 // Adjust offset to be on a code point boundary and not beyond the end of the text. 506 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 507 // But we still need the side effect of leaving iteration at the following boundary. 508 int adjustedOffset = CISetIndex32(fText, offset); 509 510 boolean result = false; 511 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 512 result = (fBreakCache.current() == offset); 513 } 514 515 if (!result) { 516 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 517 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 518 next(); 519 } 520 return result; 521 522 } 523 524 /** 525 * Returns the current iteration position. Note that DONE is never 526 * returned from this function; if iteration has run to the end of a 527 * string, current() will return the length of the string while 528 * next() will return BreakIterator.DONE). 529 * @return The current iteration position. 530 */ 531 @Override current()532 public int current() { 533 return (fText != null) ? fPosition : BreakIterator.DONE; 534 } 535 536 537 /** 538 * Return the status tag from the break rule that determined the boundary at 539 * the current iteration position. The values appear in the rule source 540 * within brackets, {123}, for example. For rules that do not specify a 541 * status, a default value of 0 is returned. If more than one rule applies, 542 * the numerically largest of the possible status values is returned. 543 * <p> 544 * Of the standard types of ICU break iterators, only the word and line break 545 * iterator provides status values. The values are defined in 546 * class RuleBasedBreakIterator, and allow distinguishing between words 547 * that contain alphabetic letters, "words" that appear to be numbers, 548 * punctuation and spaces, words containing ideographic characters, and 549 * more. Call <code>getRuleStatus</code> after obtaining a boundary 550 * position from <code>next()</code>, <code>previous()</code>, or 551 * any other break iterator functions that returns a boundary position. 552 * <p> 553 * Note that <code>getRuleStatus()</code> returns the value corresponding to 554 * <code>current()</code> index even after <code>next()</code> has returned DONE. 555 * <p> 556 557 * @return the status from the break rule that determined the boundary 558 * at the current iteration position. 559 */ 560 561 @Override getRuleStatus()562 public int getRuleStatus() { 563 // Status records have this form: 564 // Count N <-- fLastRuleStatusIndex points here. 565 // Status val 0 566 // Status val 1 567 // ... 568 // Status val N-1 <-- the value we need to return 569 // The status values are sorted in ascending order. 570 // This function returns the last (largest) of the array of status values. 571 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 572 int tagVal = fRData.fStatusTable[idx]; 573 return tagVal; 574 } 575 576 /** 577 * Get the status (tag) values from the break rule(s) that determined the boundary 578 * at the current iteration position. The values appear in the rule source 579 * within brackets, {123}, for example. The default status value for rules 580 * that do not explicitly provide one is zero. 581 * <p> 582 * The status values used by the standard ICU break rules are defined 583 * as public constants in class RuleBasedBreakIterator. 584 * <p> 585 * If the size of the output array is insufficient to hold the data, 586 * the output will be truncated to the available length. No exception 587 * will be thrown. 588 * 589 * @param fillInArray an array to be filled in with the status values. 590 * @return The number of rule status values from the rules that determined 591 * the boundary at the current iteration position. 592 * In the event that the array is too small, the return value 593 * is the total number of status values that were available, 594 * not the reduced number that were actually returned. 595 */ 596 @Override getRuleStatusVec(int[] fillInArray)597 public int getRuleStatusVec(int[] fillInArray) { 598 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 599 if (fillInArray != null) { 600 int numToCopy = Math.min(numStatusVals, fillInArray.length); 601 for (int i=0; i<numToCopy; i++) { 602 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 603 } 604 } 605 return numStatusVals; 606 } 607 608 /** 609 * Returns a CharacterIterator over the text being analyzed. 610 * <p> 611 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 612 * must not be modified in any way while the BreakIterator is still in use. 613 * Doing so will lead to undefined behavior of the BreakIterator. 614 * Clone the returned CharacterIterator first and work with that. 615 * <p> 616 * The returned CharacterIterator is a reference 617 * to the <b>actual iterator being used</b> by the BreakIterator. 618 * No guarantees are made about the current position 619 * of this iterator when it is returned; it may differ from the 620 * BreakIterators current position. If you need to move that 621 * position to examine the text, clone this function's return value first. 622 * @return An iterator over the text being analyzed. 623 */ 624 @Override getText()625 public CharacterIterator getText() { 626 return fText; 627 } 628 629 /** 630 * Set the iterator to analyze a new piece of text. This function resets 631 * the current iteration position to the beginning of the text. 632 * (The old iterator is dropped.) 633 * <p> 634 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 635 * directly by the BreakIterator, and must not be altered in any 636 * way by code outside of the BreakIterator. 637 * Doing so will lead to undefined behavior of the BreakIterator. 638 * 639 * @param newText An iterator over the text to analyze. 640 */ 641 @Override setText(CharacterIterator newText)642 public void setText(CharacterIterator newText) { 643 if (newText != null) { 644 fBreakCache.reset(newText.getBeginIndex(), 0); 645 } else { 646 fBreakCache.reset(); 647 } 648 fDictionaryCache.reset(); 649 fText = newText; 650 this.first(); 651 } 652 653 /** 654 * Control debug, trace and dump options. 655 * @deprecated This API is ICU internal only. 656 * @hide draft / provisional / internal are hidden on Android 657 */ 658 @Deprecated 659 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 660 ICUDebug.value(RBBI_DEBUG_ARG) : null; 661 662 getLanguageBreakEngine(int c)663 private LanguageBreakEngine getLanguageBreakEngine(int c) { 664 665 // We have a dictionary character. 666 // Does an already instantiated break engine handle it? 667 for (LanguageBreakEngine candidate : fBreakEngines) { 668 if (candidate.handles(c)) { 669 return candidate; 670 } 671 } 672 673 synchronized (gAllBreakEngines) { 674 // This break iterator's list of break engines didn't handle the character. 675 // Check the global list, another break iterator may have instantiated the 676 // desired engine. 677 for (LanguageBreakEngine candidate : gAllBreakEngines) { 678 if (candidate.handles(c)) { 679 fBreakEngines.add(candidate); 680 return candidate; 681 } 682 } 683 684 // The global list doesn't have an existing engine, build one. 685 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 686 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 687 // Katakana, Hiragana and Han are handled by the same dictionary engine. 688 // Fold them together for mapping from script -> engine. 689 script = UScript.HAN; 690 } 691 692 LanguageBreakEngine eng; 693 try { 694 switch (script) { 695 case UScript.THAI: 696 eng = new ThaiBreakEngine(); 697 break; 698 case UScript.LAO: 699 eng = new LaoBreakEngine(); 700 break; 701 case UScript.MYANMAR: 702 eng = new BurmeseBreakEngine(); 703 break; 704 case UScript.KHMER: 705 eng = new KhmerBreakEngine(); 706 break; 707 case UScript.HAN: 708 eng = new CjkBreakEngine(false); 709 break; 710 case UScript.HANGUL: 711 eng = new CjkBreakEngine(true); 712 break; 713 default: 714 gUnhandledBreakEngine.handleChar(c); 715 eng = gUnhandledBreakEngine; 716 break; 717 } 718 } catch (IOException e) { 719 eng = null; 720 } 721 722 if (eng != null && eng != gUnhandledBreakEngine) { 723 gAllBreakEngines.add(eng); 724 fBreakEngines.add(eng); 725 } 726 return eng; 727 } // end synchronized(gAllBreakEngines) 728 } 729 730 /** 731 * The State Machine Engine for moving forward is here. 732 * This function is the heart of the RBBI run time engine. 733 * 734 * Input 735 * fPosition, the position in the text to begin from. 736 * Output 737 * fPosition: the boundary following the starting position. 738 * fDictionaryCharCount the number of dictionary characters encountered. 739 * If > 0, the segment will be further subdivided 740 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 741 * 742 * @return the new iterator position 743 * 744 * A note on supplementary characters and the position of underlying 745 * Java CharacterIterator: Normally, a character iterator is positioned at 746 * the char most recently returned by next(). Within this function, when 747 * a supplementary char is being processed, the char iterator is left 748 * sitting on the trail surrogate, in the middle of the code point. 749 * This is different from everywhere else, where an iterator always 750 * points at the lead surrogate of a supplementary. 751 */ handleNext()752 private int handleNext() { 753 if (TRACE) { 754 System.out.println("Handle Next pos char state category"); 755 } 756 757 // handleNext always sets the break tag value. 758 // Set the default for it. 759 fRuleStatusIndex = 0; 760 fDictionaryCharCount = 0; 761 762 // caches for quicker access 763 CharacterIterator text = fText; 764 CodePointTrie trie = fRData.fTrie; 765 766 char[] stateTable = fRData.fFTable.fTable; 767 int initialPosition = fPosition; 768 text.setIndex(initialPosition); 769 int result = initialPosition; 770 771 // Set up the starting char 772 int c = text.current(); 773 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 774 c = nextTrail32(text, c); 775 if (c == DONE32) { 776 fDone = true; 777 return BreakIterator.DONE; 778 } 779 } 780 781 // Set the initial state for the state machine 782 int state = START_STATE; 783 int row = fRData.getRowIndex(state); 784 short category = 3; 785 int flagsState = fRData.fFTable.fFlags; 786 int dictStart = fRData.fFTable.fDictCategoriesStart; 787 int mode = RBBI_RUN; 788 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 789 category = 2; 790 mode = RBBI_START; 791 if (TRACE) { 792 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 793 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 794 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 795 } 796 } 797 798 // loop until we reach the end of the text or transition to state 0 799 while (state != STOP_STATE) { 800 if (c == DONE32) { 801 // Reached end of input string. 802 if (mode == RBBI_END) { 803 // We have already run the loop one last time with the 804 // character set to the pseudo {eof} value. Now it is time 805 // to unconditionally bail out. 806 break; 807 } 808 // Run the loop one last time with the fake end-of-input character category 809 mode = RBBI_END; 810 category = 1; 811 } 812 else if (mode == RBBI_RUN) { 813 // Get the char category. An incoming category of 1 or 2 mens that 814 // we are preset for doing the beginning or end of input, and 815 // that we shouldn't get a category from an actual text input character. 816 // 817 818 // look up the current character's character category, which tells us 819 // which column in the state table to look at. 820 // 821 category = (short) trie.get(c); 822 823 // Check for categories that require word dictionary handling. 824 if (category >= dictStart) { 825 fDictionaryCharCount++; 826 } 827 828 if (TRACE) { 829 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 830 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 831 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 832 } 833 834 // Advance to the next character. 835 // If this is a beginning-of-input loop iteration, don't advance. 836 // The next iteration will be processing the first real input character. 837 c = text.next(); 838 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 839 c = nextTrail32(text, c); 840 } 841 } 842 else { 843 mode = RBBI_RUN; 844 } 845 846 // look up a state transition in the state table 847 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 848 row = fRData.getRowIndex(state); 849 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 850 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 851 // Match found, common case 852 result = text.getIndex(); 853 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 854 // The iterator has been left in the middle of a surrogate pair. 855 // We want the start of it. 856 result--; 857 } 858 859 // Remember the break status (tag) values. 860 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 861 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 862 // Lookahead match is completed 863 int lookaheadResult = fLookAheadMatches[accepting]; 864 if (lookaheadResult >= 0) { 865 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 866 fPosition = lookaheadResult; 867 return lookaheadResult; 868 } 869 } 870 871 872 // If we are at the position of the '/' in a look-ahead (hard break) rule; 873 // record the current position, to be returned later, if the full rule matches. 874 // TODO: Move this check before the previous check of fAccepting. 875 // This would enable hard-break rules with no following context. 876 // But there are line break test failures when trying this. Investigate. 877 // Issue ICU-20837 878 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 879 if (rule != 0) { 880 int pos = text.getIndex(); 881 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 882 // The iterator has been left in the middle of a surrogate pair. 883 // We want the beginning of it. 884 pos--; 885 } 886 fLookAheadMatches[rule] = pos; 887 } 888 889 890 } // End of state machine main loop 891 892 // The state machine is done. Check whether it found a match... 893 894 // If the iterator failed to advance in the match engine force it ahead by one. 895 // This indicates a defect in the break rules, which should always match 896 // at least one character. 897 898 if (result == initialPosition) { 899 if (TRACE) { 900 System.out.println("Iterator did not move. Advancing by 1."); 901 } 902 text.setIndex(initialPosition); 903 next32(text); 904 result = text.getIndex(); 905 fRuleStatusIndex = 0; 906 } 907 908 // Leave the iterator at our result position. 909 // (we may have advanced beyond the last accepting position chasing after 910 // longer matches that never completed.) 911 fPosition = result; 912 913 if (TRACE) { 914 System.out.println("result = " + result); 915 } 916 return result; 917 } 918 919 /** 920 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 921 * This locates a "Safe Position" from which the forward break rules 922 * will operate correctly. A Safe Position is not necessarily a boundary itself. 923 * 924 * The logic of this function is very similar to handleNext(), above, but simpler 925 * because the safe table does not require as many options. 926 * 927 * @param fromPosition the position in the input text to begin the iteration. 928 * @hide draft / provisional / internal are hidden on Android 929 */ handleSafePrevious(int fromPosition)930 private int handleSafePrevious(int fromPosition) { 931 char state; 932 short category = 0; 933 int result = 0; 934 935 // caches for quicker access 936 CharacterIterator text = fText; 937 CodePointTrie trie = fRData.fTrie; 938 char[] stateTable = fRData.fRTable.fTable; 939 940 CISetIndex32(text, fromPosition); 941 if (TRACE) { 942 System.out.print("Handle Previous pos char state category"); 943 } 944 945 // if we're already at the start of the text, return DONE. 946 if (text.getIndex() == text.getBeginIndex()) { 947 return BreakIterator.DONE; 948 } 949 950 // Set the initial state for the state machine 951 int c = CharacterIteration.previous32(text); 952 state = START_STATE; 953 int row = fRData.getRowIndex(state); 954 955 // loop until we reach the start of the text or transition to state 0 956 // 957 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 958 959 // look up the current character's character category, which tells us 960 // which column in the state table to look at. 961 // 962 // And off the dictionary flag bit. For reverse iteration it is not used. 963 category = (short) trie.get(c); 964 if (TRACE) { 965 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 966 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 967 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 968 } 969 970 // State Transition - move machine to its next state 971 // 972 assert(category < fRData.fHeader.fCatCount); 973 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 974 row = fRData.getRowIndex(state); 975 976 if (state == STOP_STATE) { 977 // This is the normal exit from the lookup state machine. 978 // Transition to state zero means we have found a safe point. 979 break; 980 } 981 } 982 983 // The state machine is done. 984 result = text.getIndex(); 985 if (TRACE) { 986 System.out.println("result = " + result); 987 } 988 return result; 989 } 990 991 /** 992 * Set the index of a CharacterIterator. 993 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 994 * If the index points to a trail surrogate of a supplementary character, adjust it 995 * to the start (lead surrogate) index. 996 * 997 * @param ci A CharacterIterator to set 998 * @param index the index to set 999 * @return the resulting index, possibly pinned or adjusted. 1000 */ 1001 private static int CISetIndex32(CharacterIterator ci, int index) { 1002 if (index <= ci.getBeginIndex()) { 1003 ci.first(); 1004 } else if (index >= ci.getEndIndex()) { 1005 ci.setIndex(ci.getEndIndex()); 1006 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1007 if (!Character.isHighSurrogate(ci.previous())) { 1008 ci.next(); 1009 } 1010 } 1011 return ci.getIndex(); 1012 } 1013 1014 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1015 * Dictionary boundaries are moved first to this cache, then from here 1016 * to the main BreakCache, where they may inter-leave with non-dictionary 1017 * boundaries. The public BreakIterator API always fetches directly 1018 * from the main BreakCache, not from here. 1019 * 1020 * In common situations, the number of boundaries in a single dictionary run 1021 * should be quite small, it will be terminated by punctuation, spaces, 1022 * or any other non-dictionary characters. The main BreakCache may end 1023 * up with boundaries from multiple dictionary based runs. 1024 * 1025 * The boundaries are stored in a simple ArrayList (vector), with the 1026 * assumption that they will be accessed sequentially. 1027 */ 1028 class DictionaryCache { 1029 1030 void reset() { 1031 fPositionInCache = -1; 1032 fStart = 0; 1033 fLimit = 0; 1034 fFirstRuleStatusIndex = 0; 1035 fOtherRuleStatusIndex = 0; 1036 fBreaks.removeAllElements(); 1037 }; 1038 1039 boolean following(int fromPos) { 1040 if (fromPos >= fLimit || fromPos < fStart) { 1041 fPositionInCache = -1; 1042 return false; 1043 } 1044 1045 // Sequential iteration, move from previous boundary to the following 1046 1047 int r = 0; 1048 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1049 ++fPositionInCache; 1050 if (fPositionInCache >= fBreaks.size()) { 1051 fPositionInCache = -1; 1052 return false; 1053 } 1054 r = fBreaks.elementAt(fPositionInCache); 1055 assert(r > fromPos); 1056 fBoundary = r; 1057 fStatusIndex = fOtherRuleStatusIndex; 1058 return true; 1059 } 1060 1061 // Random indexing. Linear search for the boundary following the given position. 1062 1063 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1064 r= fBreaks.elementAt(fPositionInCache); 1065 if (r > fromPos) { 1066 fBoundary = r; 1067 fStatusIndex = fOtherRuleStatusIndex; 1068 return true; 1069 } 1070 } 1071 1072 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1073 assert(false); 1074 fPositionInCache = -1; 1075 return false; 1076 }; 1077 preceding(int fromPos)1078 boolean preceding(int fromPos) { 1079 if (fromPos <= fStart || fromPos > fLimit) { 1080 fPositionInCache = -1; 1081 return false; 1082 } 1083 1084 if (fromPos == fLimit) { 1085 fPositionInCache = fBreaks.size() - 1; 1086 if (fPositionInCache >= 0) { 1087 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1088 } 1089 } 1090 1091 int r; 1092 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1093 --fPositionInCache; 1094 r = fBreaks.elementAt(fPositionInCache); 1095 assert(r < fromPos); 1096 fBoundary = r; 1097 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1098 return true; 1099 } 1100 1101 if (fPositionInCache == 0) { 1102 fPositionInCache = -1; 1103 return false; 1104 } 1105 1106 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1107 r = fBreaks.elementAt(fPositionInCache); 1108 if (r < fromPos) { 1109 fBoundary = r; 1110 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1111 return true; 1112 } 1113 } 1114 assert(false); 1115 fPositionInCache = -1; 1116 return false; 1117 }; 1118 1119 /** 1120 * Populate the cache with the dictionary based boundaries within a region of text. 1121 * @param startPos The start position of a range of text 1122 * @param endPos The end position of a range of text 1123 * @param firstRuleStatus The rule status index that applies to the break at startPos 1124 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1125 * @hide draft / provisional / internal are hidden on Android 1126 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1127 void populateDictionary(int startPos, int endPos, 1128 int firstRuleStatus, int otherRuleStatus) { 1129 if ((endPos - startPos) <= 1) { 1130 return; 1131 } 1132 1133 reset(); 1134 fFirstRuleStatusIndex = firstRuleStatus; 1135 fOtherRuleStatusIndex = otherRuleStatus; 1136 1137 int rangeStart = startPos; 1138 int rangeEnd = endPos; 1139 1140 int category; 1141 int current; 1142 int foundBreakCount = 0; 1143 1144 // Loop through the text, looking for ranges of dictionary characters. 1145 // For each span, find the appropriate break engine, and ask it to find 1146 // any breaks within the span. 1147 1148 fText.setIndex(rangeStart); 1149 int c = CharacterIteration.current32(fText); 1150 category = (short)fRData.fTrie.get(c); 1151 int dictStart = fRData.fFTable.fDictCategoriesStart; 1152 1153 while(true) { 1154 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1155 c = CharacterIteration.next32(fText); // pre-increment 1156 category = (short)fRData.fTrie.get(c); 1157 } 1158 if (current >= rangeEnd) { 1159 break; 1160 } 1161 1162 // We now have a dictionary character. Get the appropriate language object 1163 // to deal with it. 1164 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1165 1166 // Ask the language object if there are any breaks. It will add them to the cache and 1167 // leave the text pointer on the other side of its range, ready to search for the next one. 1168 if (lbe != null) { 1169 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); 1170 } 1171 1172 // Reload the loop variables for the next go-round 1173 c = CharacterIteration.current32(fText); 1174 category = (short)fRData.fTrie.get(c); 1175 } 1176 1177 // If we found breaks, ensure that the first and last entries are 1178 // the original starting and ending position. And initialize the 1179 // cache iteration position to the first entry. 1180 1181 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1182 if (foundBreakCount > 0) { 1183 assert(foundBreakCount == fBreaks.size()); 1184 if (startPos < fBreaks.elementAt(0)) { 1185 // The dictionary did not place a boundary at the start of the segment of text. 1186 // Add one now. This should not commonly happen, but it would be easy for interactions 1187 // of the rules for dictionary segments and the break engine implementations to 1188 // inadvertently cause it. Cover it here, just in case. 1189 fBreaks.offer(startPos); 1190 } 1191 if (endPos > fBreaks.peek()) { 1192 fBreaks.push(endPos); 1193 } 1194 fPositionInCache = 0; 1195 // Note: Dictionary matching may extend beyond the original limit. 1196 fStart = fBreaks.elementAt(0); 1197 fLimit = fBreaks.peek(); 1198 } else { 1199 // there were no language-based breaks, even though the segment contained 1200 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1201 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1202 } 1203 1204 }; 1205 1206 DictionaryCache()1207 DictionaryCache() { 1208 fPositionInCache = -1; 1209 fBreaks = new DictionaryBreakEngine.DequeI(); 1210 } 1211 1212 /** 1213 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1214 * 1215 * @param src the source object to be copied. 1216 */ DictionaryCache(DictionaryCache src)1217 DictionaryCache(DictionaryCache src) { 1218 try { 1219 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1220 } 1221 catch (CloneNotSupportedException e) { 1222 throw new RuntimeException(e); 1223 } 1224 fPositionInCache = src.fPositionInCache; 1225 fStart = src.fStart; 1226 fLimit = src.fLimit; 1227 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1228 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1229 fBoundary = src.fBoundary; 1230 fStatusIndex = src.fStatusIndex; 1231 } 1232 1233 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1234 DictionaryBreakEngine.DequeI fBreaks; 1235 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1236 // // or preceding(). Optimizes sequential access. 1237 int fStart; // Text position of first boundary in cache. 1238 int fLimit; // Last boundary in cache. Which is the limit of the 1239 // // text segment being handled by the dictionary. 1240 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1241 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1242 int fBoundary; // Current boundary. Set by preceding(), following(). 1243 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1244 }; 1245 1246 1247 1248 1249 /* 1250 * class BreakCache 1251 * 1252 * Cache of break boundary positions and rule status values. 1253 * Break iterator API functions, next(), previous(), etc., will use cached results 1254 * when possible, and otherwise cache new results as they are obtained. 1255 * 1256 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1257 * 1258 * The cache is implemented as a single circular buffer. 1259 */ 1260 1261 /* 1262 * size of the circular cache buffer. 1263 */ 1264 1265 class BreakCache { 1266 BreakCache()1267 BreakCache() { 1268 reset(); 1269 }; 1270 reset(int pos, int ruleStatus)1271 void reset(int pos, int ruleStatus) { 1272 fStartBufIdx = 0; 1273 fEndBufIdx = 0; 1274 fTextIdx = pos; 1275 fBufIdx = 0; 1276 fBoundaries[0] = pos; 1277 fStatuses[0] = (short)ruleStatus; 1278 } 1279 reset()1280 void reset() {reset(0, 0); }; 1281 next()1282 void next() { 1283 if (fBufIdx == fEndBufIdx) { 1284 fDone = !populateFollowing(); 1285 fPosition = fTextIdx; 1286 fRuleStatusIndex = fStatuses[fBufIdx]; 1287 } else { 1288 fBufIdx = modChunkSize(fBufIdx + 1); 1289 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1290 fRuleStatusIndex = fStatuses[fBufIdx]; 1291 } 1292 }; 1293 previous()1294 void previous() { 1295 int initialBufIdx = fBufIdx; 1296 if (fBufIdx == fStartBufIdx) { 1297 // At start of cache. Prepend to it. 1298 populatePreceding(); 1299 } else { 1300 // Cache already holds the next boundary 1301 fBufIdx = modChunkSize(fBufIdx - 1); 1302 fTextIdx = fBoundaries[fBufIdx]; 1303 } 1304 fDone = (fBufIdx == initialBufIdx); 1305 fPosition = fTextIdx; 1306 fRuleStatusIndex = fStatuses[fBufIdx]; 1307 return; 1308 }; 1309 1310 // Move the iteration state to the position following the startPosition. 1311 // Input position must be pinned to the input length. following(int startPos)1312 void following(int startPos) { 1313 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1314 // startPos is in the cache. Do a next() from that position. 1315 // TODO: an awkward set of interactions with bi->fDone 1316 // seek() does not clear it; it can't because of interactions with populateNear(). 1317 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1318 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1319 fDone = false; 1320 next(); 1321 } 1322 1323 }; 1324 preceding(int startPos)1325 void preceding(int startPos) { 1326 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1327 if (startPos == fTextIdx) { 1328 previous(); 1329 } else { 1330 // seek() leaves the BreakCache positioned at the preceding boundary 1331 // if the requested position is between two bounaries. 1332 // current() pushes the BreakCache position out to the BreakIterator itself. 1333 assert(startPos > fTextIdx); 1334 current(); 1335 } 1336 } 1337 return; 1338 }; 1339 1340 /** 1341 * Update the state of the public BreakIterator (fBI) to reflect the 1342 * current state of the break iterator cache (this). 1343 */ current()1344 int current() { 1345 fPosition = fTextIdx; 1346 fRuleStatusIndex = fStatuses[fBufIdx]; 1347 fDone = false; 1348 return fTextIdx; 1349 }; 1350 1351 /** 1352 * Add boundaries to the cache near the specified position. 1353 * The given position need not be a boundary itself. 1354 * The input position must be within the range of the text, and 1355 * on a code point boundary. 1356 * If the requested position is a break boundary, leave the iteration 1357 * position on it. 1358 * If the requested position is not a boundary, leave the iteration 1359 * position on the preceding boundary and include both the the 1360 * preceding and following boundaries in the cache. 1361 * Additional boundaries, either preceding or following, may be added 1362 * to the cache as a side effect. 1363 * 1364 * Return false if the operation failed. 1365 */ populateNear(int position)1366 boolean populateNear(int position) { 1367 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1368 1369 // Find a boundary somewhere in the vicinity of the requested position. 1370 // Depending on the safe rules and the text data, it could be either before, at, or after 1371 // the requested position. 1372 1373 1374 // If the requested position is not near already cached positions, clear the existing cache, 1375 // find a near-by boundary and begin new cache contents there. 1376 1377 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1378 int aBoundary = fText.getBeginIndex(); 1379 int ruleStatusIndex = 0; 1380 if (position > aBoundary + 20) { 1381 int backupPos = handleSafePrevious(position); 1382 if (backupPos > aBoundary) { 1383 // Advance to the boundary following the backup position. 1384 // There is a complication: the safe reverse rules identify pairs of code points 1385 // that are safe. If advancing from the safe point moves forwards by less than 1386 // two code points, we need to advance one more time to ensure that the boundary 1387 // is good, including a correct rules status value. 1388 // 1389 fPosition = backupPos; 1390 aBoundary = handleNext(); 1391 if (aBoundary == backupPos + 1 || 1392 (aBoundary == backupPos + 2 && 1393 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1394 Character.isLowSurrogate(fText.next()))) { 1395 // The initial handleNext() only advanced by a single code point. Go again. 1396 // Safe rules identify safe pairs. 1397 aBoundary = handleNext(); 1398 } 1399 } 1400 ruleStatusIndex = fRuleStatusIndex; 1401 } 1402 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1403 } 1404 1405 // Fill in boundaries between existing cache content and the new requested position. 1406 1407 if (fBoundaries[fEndBufIdx] < position) { 1408 // The last position in the cache precedes the requested position. 1409 // Add following position(s) to the cache. 1410 while (fBoundaries[fEndBufIdx] < position) { 1411 if (!populateFollowing()) { 1412 assert false; 1413 return false; 1414 } 1415 } 1416 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1417 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1418 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1419 previous(); 1420 } 1421 return true; 1422 } 1423 1424 if (fBoundaries[fStartBufIdx] > position) { 1425 // The first position in the cache is beyond the requested position. 1426 // back up more until we get a boundary <= the requested position. 1427 while (fBoundaries[fStartBufIdx] > position) { 1428 populatePreceding(); 1429 } 1430 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1431 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1432 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1433 next(); 1434 } 1435 if (fTextIdx > position) { 1436 // If position is not itself a boundary, the next() loop above will overshoot. 1437 // Back up one, leaving cache position at the boundary preceding the requested position. 1438 previous(); 1439 } 1440 return true; 1441 } 1442 1443 assert fTextIdx == position; 1444 return true; 1445 1446 }; 1447 1448 /** 1449 * Add boundary(s) to the cache following the current last boundary. 1450 * Return false if at the end of the text, and no more boundaries can be added. 1451 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1452 */ populateFollowing()1453 boolean populateFollowing() { 1454 int fromPosition = fBoundaries[fEndBufIdx]; 1455 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1456 int pos = 0; 1457 int ruleStatusIdx = 0; 1458 1459 if (fDictionaryCache.following(fromPosition)) { 1460 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1461 return true; 1462 } 1463 1464 fPosition = fromPosition; 1465 pos = handleNext(); 1466 if (pos == BreakIterator.DONE) { 1467 return false; 1468 } 1469 1470 ruleStatusIdx = fRuleStatusIndex; 1471 if (fDictionaryCharCount > 0) { 1472 // The text segment obtained from the rules includes dictionary characters. 1473 // Subdivide it, with subdivided results going into the dictionary cache. 1474 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1475 if (fDictionaryCache.following(fromPosition)) { 1476 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1477 return true; 1478 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1479 // But be careful with interactions with populateNear(). 1480 } 1481 } 1482 1483 // Rule based segment did not include dictionary characters. 1484 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1485 // meaning that we didn't take the return, above. 1486 // Add its end point to the cache. 1487 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1488 1489 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1490 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1491 // 1492 for (int count=0; count<6; ++count) { 1493 pos = handleNext(); 1494 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1495 break; 1496 } 1497 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1498 } 1499 return true; 1500 }; 1501 1502 /** 1503 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1504 * Leave the iteration position on the first added boundary. 1505 * Return false if no boundaries could be added (if at the start of the text.) 1506 */ populatePreceding()1507 boolean populatePreceding() { 1508 int textBegin = fText.getBeginIndex(); 1509 int fromPosition = fBoundaries[fStartBufIdx]; 1510 if (fromPosition == textBegin) { 1511 return false; 1512 } 1513 1514 int position = textBegin; 1515 int positionStatusIdx = 0; 1516 1517 if (fDictionaryCache.preceding(fromPosition)) { 1518 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1519 return true; 1520 } 1521 1522 int backupPosition = fromPosition; 1523 1524 // Find a boundary somewhere preceding the first already-cached boundary 1525 do { 1526 backupPosition = backupPosition - 30; 1527 if (backupPosition <= textBegin) { 1528 backupPosition = textBegin; 1529 } else { 1530 backupPosition = handleSafePrevious(backupPosition); 1531 } 1532 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1533 position = textBegin; 1534 positionStatusIdx = 0; 1535 } else { 1536 // Advance to the boundary following the backup position. 1537 // There is a complication: the safe reverse rules identify pairs of code points 1538 // that are safe. If advancing from the safe point moves forwards by less than 1539 // two code points, we need to advance one more time to ensure that the boundary 1540 // is good, including a correct rules status value. 1541 // 1542 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1543 position = handleNext(); 1544 if (position == backupPosition + 1 || 1545 (position == backupPosition + 2 && 1546 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1547 Character.isLowSurrogate(fText.next()))) { 1548 // The initial handleNext() only advanced by a single code point. Go again. 1549 // Safe rules identify safe pairs. 1550 position = handleNext(); 1551 } 1552 positionStatusIdx = fRuleStatusIndex; 1553 } 1554 } while (position >= fromPosition); 1555 1556 // Find boundaries between the one we just located and the first already-cached boundary 1557 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1558 1559 fSideBuffer.removeAllElements(); 1560 fSideBuffer.push(position); 1561 fSideBuffer.push(positionStatusIdx); 1562 1563 do { 1564 int prevPosition = fPosition = position; 1565 int prevStatusIdx = positionStatusIdx; 1566 position = handleNext(); 1567 positionStatusIdx = fRuleStatusIndex; 1568 if (position == BreakIterator.DONE) { 1569 break; 1570 } 1571 1572 boolean segmentHandledByDictionary = false; 1573 if (fDictionaryCharCount != 0) { 1574 // Segment from the rules includes dictionary characters. 1575 // Subdivide it, with subdivided results going into the dictionary cache. 1576 int dictSegEndPosition = position; 1577 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1578 while (fDictionaryCache.following(prevPosition)) { 1579 position = fDictionaryCache.fBoundary; 1580 positionStatusIdx = fDictionaryCache.fStatusIndex; 1581 segmentHandledByDictionary = true; 1582 assert(position > prevPosition); 1583 if (position >= fromPosition) { 1584 break; 1585 } 1586 assert(position <= dictSegEndPosition); 1587 fSideBuffer.push(position); 1588 fSideBuffer.push(positionStatusIdx); 1589 prevPosition = position; 1590 } 1591 assert(position==dictSegEndPosition || position>=fromPosition); 1592 } 1593 1594 if (!segmentHandledByDictionary && position < fromPosition) { 1595 fSideBuffer.push(position); 1596 fSideBuffer.push(positionStatusIdx); 1597 } 1598 } while (position < fromPosition); 1599 1600 // Move boundaries from the side buffer to the main circular buffer. 1601 boolean success = false; 1602 if (!fSideBuffer.isEmpty()) { 1603 positionStatusIdx = fSideBuffer.pop(); 1604 position = fSideBuffer.pop(); 1605 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1606 success = true; 1607 } 1608 1609 while (!fSideBuffer.isEmpty()) { 1610 positionStatusIdx = fSideBuffer.pop(); 1611 position = fSideBuffer.pop(); 1612 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1613 // No space in circular buffer to hold a new preceding result while 1614 // also retaining the current cache (iteration) position. 1615 // Bailing out is safe; the cache will refill again if needed. 1616 break; 1617 } 1618 } 1619 return success; 1620 }; 1621 1622 1623 static final boolean RetainCachePosition = false; 1624 static final boolean UpdateCachePosition = true; 1625 1626 /** 1627 * Add the boundary following the current position. 1628 * The current position can be left as it was, or changed to the newly added boundary, 1629 * as specified by the update parameter. 1630 */ addFollowing(int position, int ruleStatusIdx, boolean update)1631 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1632 assert(position > fBoundaries[fEndBufIdx]); 1633 assert(ruleStatusIdx <= Short.MAX_VALUE); 1634 int nextIdx = modChunkSize(fEndBufIdx + 1); 1635 if (nextIdx == fStartBufIdx) { 1636 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1637 } 1638 fBoundaries[nextIdx] = position; 1639 fStatuses[nextIdx] = (short)ruleStatusIdx; 1640 fEndBufIdx = nextIdx; 1641 if (update == UpdateCachePosition) { 1642 // Set current position to the newly added boundary. 1643 fBufIdx = nextIdx; 1644 fTextIdx = position; 1645 } else { 1646 // Retaining the original cache position. 1647 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1648 // It's the responsibility of callers of this function to not add too many. 1649 assert(nextIdx != fBufIdx); 1650 } 1651 1652 }; 1653 1654 1655 /** 1656 * Add the boundary preceding the current position. 1657 * The current position can be left as it was, or changed to the newly added boundary, 1658 * as specified by the update parameter. 1659 */ addPreceding(int position, int ruleStatusIdx, boolean update)1660 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1661 assert(position < fBoundaries[fStartBufIdx]); 1662 assert(ruleStatusIdx <= Short.MAX_VALUE); 1663 int nextIdx = modChunkSize(fStartBufIdx - 1); 1664 if (nextIdx == fEndBufIdx) { 1665 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1666 // Failure. The insertion of the new boundary would claim the buffer position that is the 1667 // current iteration position. And we also want to retain the current iteration position. 1668 // (The buffer is already completely full of entries that precede the iteration position.) 1669 return false; 1670 } 1671 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1672 } 1673 fBoundaries[nextIdx] = position; 1674 fStatuses[nextIdx] = (short)ruleStatusIdx; 1675 fStartBufIdx = nextIdx; 1676 if (update == UpdateCachePosition) { 1677 fBufIdx = nextIdx; 1678 fTextIdx = position; 1679 } 1680 return true; 1681 }; 1682 1683 /** 1684 * Set the cache position to the specified position, or, if the position 1685 * falls between to cached boundaries, to the preceding boundary. 1686 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1687 * The startPosition must be on a code point boundary. 1688 * 1689 * Return true if successful, false if the specified position is after 1690 * the last cached boundary or before the first. 1691 */ 1692 boolean seek(int pos) { 1693 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1694 return false; 1695 } 1696 if (pos == fBoundaries[fStartBufIdx]) { 1697 // Common case: seek(0), from BreakIterator::first() 1698 fBufIdx = fStartBufIdx; 1699 fTextIdx = fBoundaries[fBufIdx]; 1700 return true; 1701 } 1702 if (pos == fBoundaries[fEndBufIdx]) { 1703 fBufIdx = fEndBufIdx; 1704 fTextIdx = fBoundaries[fBufIdx]; 1705 return true; 1706 } 1707 1708 int min = fStartBufIdx; 1709 int max = fEndBufIdx; 1710 while (min != max) { 1711 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1712 probe = modChunkSize(probe); 1713 if (fBoundaries[probe] > pos) { 1714 max = probe; 1715 } else { 1716 min = modChunkSize(probe + 1); 1717 } 1718 } 1719 assert(fBoundaries[max] > pos); 1720 fBufIdx = modChunkSize(max - 1); 1721 fTextIdx = fBoundaries[fBufIdx]; 1722 assert(fTextIdx <= pos); 1723 return true; 1724 1725 }; 1726 1727 1728 /** 1729 * copy constructor, used from RuleBasedBreakIterator.clone(). 1730 * 1731 * @param src 1732 */ BreakCache(BreakCache src)1733 BreakCache(BreakCache src) { 1734 fStartBufIdx = src.fStartBufIdx; 1735 fEndBufIdx = src.fEndBufIdx; 1736 fTextIdx = src.fTextIdx; 1737 fBufIdx = src.fBufIdx; 1738 fBoundaries = src.fBoundaries.clone(); 1739 fStatuses = src.fStatuses.clone(); 1740 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1741 } 1742 dumpCache()1743 void dumpCache() { 1744 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1745 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1746 System.out.printf("%d %d%n", i, fBoundaries[i]); 1747 if (i == fEndBufIdx) { 1748 break; 1749 } 1750 } 1751 }; 1752 modChunkSize(int index)1753 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1754 1755 static final int CACHE_SIZE = 128; 1756 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1757 1758 int fStartBufIdx; 1759 int fEndBufIdx; // inclusive 1760 1761 int fTextIdx; 1762 int fBufIdx; 1763 1764 int[] fBoundaries = new int[CACHE_SIZE]; 1765 short[] fStatuses = new short[CACHE_SIZE]; 1766 1767 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1768 }; 1769 1770 1771 1772 1773 } 1774 1775