1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2005-2016 International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.text; 12 13 import static android.icu.impl.CharacterIteration.DONE32; 14 import static android.icu.impl.CharacterIteration.next32; 15 import static android.icu.impl.CharacterIteration.nextTrail32; 16 17 import java.io.ByteArrayOutputStream; 18 import java.io.IOException; 19 import java.io.InputStream; 20 import java.io.OutputStream; 21 import java.nio.ByteBuffer; 22 import java.text.CharacterIterator; 23 import java.util.MissingResourceException; 24 import java.util.concurrent.ConcurrentLinkedQueue; 25 26 import android.icu.impl.CharacterIteration; 27 import android.icu.impl.ICUBinary; 28 import android.icu.impl.ICUDebug; 29 import android.icu.impl.RBBIDataWrapper; 30 import android.icu.impl.breakiter.BurmeseBreakEngine; 31 import android.icu.impl.breakiter.CjkBreakEngine; 32 import android.icu.impl.breakiter.DictionaryBreakEngine; 33 import android.icu.impl.breakiter.KhmerBreakEngine; 34 import android.icu.impl.breakiter.LSTMBreakEngine; 35 import android.icu.impl.breakiter.LanguageBreakEngine; 36 import android.icu.impl.breakiter.LaoBreakEngine; 37 import android.icu.impl.breakiter.ThaiBreakEngine; 38 import android.icu.impl.breakiter.UnhandledBreakEngine; 39 import android.icu.lang.UCharacter; 40 import android.icu.lang.UProperty; 41 import android.icu.lang.UScript; 42 import android.icu.util.CodePointTrie; 43 44 /** 45 * Rule Based Break Iterator 46 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 47 * 48 * @hide Only a subset of ICU is exposed in Android 49 */ 50 public class RuleBasedBreakIterator extends BreakIterator { 51 //======================================================================= 52 // Constructors & Factories 53 //======================================================================= 54 55 /** 56 * private constructor 57 */ RuleBasedBreakIterator()58 private RuleBasedBreakIterator() { 59 fDictionaryCharCount = 0; 60 } 61 62 /** 63 * Create a break iterator from a precompiled set of break rules. 64 * 65 * Creating a break iterator from the binary rules is much faster than 66 * creating one from source rules. 67 * 68 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 69 * Binary break iterator rules are not guaranteed to be compatible between 70 * different versions of ICU. 71 * 72 * @param is an input stream supplying the compiled binary rules. 73 * @throws IOException if there is an error while reading the rules from the InputStream. 74 * @see #compileRules(String, OutputStream) 75 */ getInstanceFromCompiledRules(InputStream is)76 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 77 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 78 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 79 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 80 return This; 81 } 82 83 /** 84 * This factory method doesn't have an access modifier; it is only accessible in the same 85 * package. 86 * 87 * Create a break iterator from a precompiled set of break rules. 88 * 89 * Creating a break iterator from the binary rules is much faster than 90 * creating one from source rules. 91 * 92 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 93 * Binary break iterator rules are not guaranteed to be compatible between 94 * different versions of ICU. 95 * 96 * @param bytes a buffer supplying the compiled binary rules. 97 * @param phraseBreaking a flag indicating if phrase breaking is required. 98 * @throws IOException if there is an error while reading the rules from the buffer. 99 * @see #compileRules(String, OutputStream) 100 * @hide draft / provisional / internal are hidden on Android 101 */ getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)102 /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( 103 ByteBuffer bytes, boolean phraseBreaking) throws IOException { 104 RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); 105 instance.fPhraseBreaking = phraseBreaking; 106 return instance; 107 } 108 109 /** 110 * Create a break iterator from a precompiled set of break rules. 111 * 112 * Creating a break iterator from the binary rules is much faster than 113 * creating one from source rules. 114 * 115 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 116 * Binary break iterator rules are not guaranteed to be compatible between 117 * different versions of ICU. 118 * 119 * @param bytes a buffer supplying the compiled binary rules. 120 * @throws IOException if there is an error while reading the rules from the buffer. 121 * @see #compileRules(String, OutputStream) 122 * @deprecated This API is ICU internal only. 123 * @hide draft / provisional / internal are hidden on Android 124 */ 125 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)126 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 127 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 128 This.fRData = RBBIDataWrapper.get(bytes); 129 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 130 return This; 131 } 132 133 /** 134 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 135 * @param rules The break rules to be used. 136 */ RuleBasedBreakIterator(String rules)137 public RuleBasedBreakIterator(String rules) { 138 this(); 139 try { 140 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 141 compileRules(rules, ruleOS); 142 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 143 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 144 } catch (IOException e) { 145 ///CLOVER:OFF 146 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 147 // causing bogus compiled rules to be produced, but with no compile error raised. 148 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 149 + e.getMessage()); 150 throw rte; 151 ///CLOVER:ON 152 } 153 } 154 155 //======================================================================= 156 // Boilerplate 157 //======================================================================= 158 159 /** 160 * Clones this iterator. 161 * @return A newly-constructed RuleBasedBreakIterator with the same 162 * behavior as this one. 163 */ 164 @Override clone()165 public Object clone() { 166 RuleBasedBreakIterator result; 167 result = (RuleBasedBreakIterator)super.clone(); 168 if (fText != null) { 169 result.fText = (CharacterIterator)(fText.clone()); 170 } 171 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 172 result.fBreakCache = result.new BreakCache(fBreakCache); 173 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 174 return result; 175 } 176 177 178 /** 179 * Returns true if both BreakIterators are of the same class, have the same 180 * rules, and iterate over the same text. 181 */ 182 @Override equals(Object that)183 public boolean equals(Object that) { 184 if (that == null) { 185 return false; 186 } 187 if (this == that) { 188 return true; 189 } 190 try { 191 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 192 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 193 return false; 194 } 195 if (fRData != null && other.fRData != null && 196 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 197 return false; 198 } 199 if (fText == null && other.fText == null) { 200 return true; 201 } 202 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 203 return false; 204 } 205 return fPosition == other.fPosition; 206 } 207 catch(ClassCastException e) { 208 return false; 209 } 210 } 211 212 /** 213 * Returns the description (rules) used to create this iterator. 214 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 215 */ 216 @Override toString()217 public String toString() { 218 String retStr = ""; 219 if (fRData != null) { 220 retStr = fRData.fRuleSource; 221 } 222 return retStr; 223 } 224 225 /** 226 * Compute a hashcode for this BreakIterator 227 * @return A hash code 228 */ 229 @Override hashCode()230 public int hashCode() 231 { 232 return fRData.fRuleSource.hashCode(); 233 } 234 235 236 private static final int START_STATE = 1; // The state number of the starting state 237 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 238 239 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 240 // of user text. A variable with this enum type keeps track of where we 241 // are. The state machine only fetches user text input while in RUN mode. 242 private static final int RBBI_START = 0; 243 private static final int RBBI_RUN = 1; 244 private static final int RBBI_END = 2; 245 246 /** 247 * The character iterator through which this BreakIterator accesses the text. 248 */ 249 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 250 251 /** 252 * The rule data for this BreakIterator instance. 253 * Not intended for public use. Declared public for testing purposes only. 254 * @deprecated This API is ICU internal only. 255 * @hide draft / provisional / internal are hidden on Android 256 */ 257 @Deprecated 258 public RBBIDataWrapper fRData; 259 260 /** 261 * The iteration state - current position, rule status for the current position, 262 * and whether the iterator ran off the end, yielding UBRK_DONE. 263 * Current position is pinned to be 0 < position <= text.length. 264 * Current position is always set to a boundary. 265 * 266 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 267 * Never has the value UBRK_DONE (-1). 268 */ 269 private int fPosition; 270 271 /** 272 * Index of the Rule {tag} values for the most recent match. 273 */ 274 private int fRuleStatusIndex; 275 276 /** 277 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 278 */ 279 private boolean fDone; 280 281 /** 282 * Array of look-ahead tentative results. 283 */ 284 private int[] fLookAheadMatches; 285 286 /** 287 * Cache of previously determined boundary positions. 288 */ 289 private BreakCache fBreakCache = new BreakCache(); 290 291 /** 292 * Flag used to indicate if phrase breaking is required. 293 */ 294 private boolean fPhraseBreaking = false; 295 296 297 /** 298 * Counter for the number of characters encountered with the "dictionary" 299 * flag set. Normal RBBI iterators don't use it, although the code 300 * for updating it is live. Dictionary Based break iterators (a subclass 301 * of us) access this field directly. 302 * @hide draft / provisional / internal are hidden on Android 303 */ 304 private int fDictionaryCharCount; 305 306 private DictionaryCache fDictionaryCache = new DictionaryCache(); 307 308 /** 309 * ICU debug argument name for RBBI 310 */ 311 private static final String RBBI_DEBUG_ARG = "rbbi"; 312 313 /** 314 * Debugging flag. Trace operation of state machine when true. 315 */ 316 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 317 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 318 319 /** 320 * The "default" break engine - just skips over ranges of dictionary words, 321 * producing no breaks. Should only be used if characters need to be handled 322 * by a dictionary but we have no dictionary implementation for them. 323 * 324 * Only one instance; shared by all break iterators. 325 */ 326 private static final UnhandledBreakEngine gUnhandledBreakEngine; 327 328 /** 329 * List of all known break engines, common for all break iterators. 330 * Lazily updated as break engines are needed, because instantiation of 331 * break engines is expensive. 332 * 333 * Important notes: 334 * <ul>Because we don't want to add the same LanguageBreakEngine multiple times, all writes 335 * are synchronized. 336 * <ul>Read access avoids explicit synchronization, but will end up being synchronized if 337 * needed. 338 */ 339 private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines; 340 341 static { 342 gUnhandledBreakEngine = new UnhandledBreakEngine(); 343 gAllBreakEngines = new ConcurrentLinkedQueue<>(); 344 gAllBreakEngines.add(gUnhandledBreakEngine); 345 } 346 347 /** 348 * Dump the contents of the state table and character classes for this break iterator. 349 * For debugging only. 350 * @deprecated This API is ICU internal only. 351 * @hide draft / provisional / internal are hidden on Android 352 */ 353 @Deprecated dump(java.io.PrintStream out)354 public void dump(java.io.PrintStream out) { 355 if (out == null) { 356 out = System.out; 357 } 358 this.fRData.dump(out); 359 } 360 361 /** 362 * Compile a set of source break rules into the binary state tables used 363 * by the break iterator engine. Creating a break iterator from precompiled 364 * rules is much faster than creating one from source rules. 365 * 366 * Binary break rules are not guaranteed to be compatible between different 367 * versions of ICU. 368 * 369 * 370 * @param rules The source form of the break rules 371 * @param ruleBinary An output stream to receive the compiled rules. 372 * @throws IOException If there is an error writing the output. 373 * @see #getInstanceFromCompiledRules(InputStream) 374 */ compileRules(String rules, OutputStream ruleBinary)375 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 376 RBBIRuleBuilder.compileRules(rules, ruleBinary); 377 } 378 379 //======================================================================= 380 // BreakIterator overrides 381 //======================================================================= 382 383 /** 384 * Sets the current iteration position to the beginning of the text. 385 * (i.e., the CharacterIterator's starting offset). 386 * @return The offset of the beginning of the text. 387 */ 388 @Override first()389 public int first() { 390 if (fText == null) { 391 return BreakIterator.DONE; 392 } 393 fText.first(); 394 int start = fText.getIndex(); 395 if (!fBreakCache.seek(start)) { 396 fBreakCache.populateNear(start); 397 } 398 fBreakCache.current(); 399 assert(fPosition == start); 400 return fPosition; 401 } 402 403 /** 404 * Sets the current iteration position to the end of the text. 405 * (i.e., the CharacterIterator's ending offset). 406 * @return The text's past-the-end offset. 407 */ 408 @Override last()409 public int last() { 410 if (fText == null) { 411 return BreakIterator.DONE; 412 } 413 int endPos = fText.getEndIndex(); 414 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 415 assert(endShouldBeBoundary); 416 if (fPosition != endPos) { 417 assert(fPosition == endPos); 418 } 419 return endPos; 420 } 421 422 /** 423 * Advances the iterator either forward or backward the specified number of steps. 424 * Negative values move backward, and positive values move forward. This is 425 * equivalent to repeatedly calling next() or previous(). 426 * @param n The number of steps to move. The sign indicates the direction 427 * (negative is backwards, and positive is forwards). 428 * @return The character offset of the boundary position n boundaries away from 429 * the current one. 430 */ 431 @Override next(int n)432 public int next(int n) { 433 int result = 0; 434 if (n > 0) { 435 for (; n > 0 && result != DONE; --n) { 436 result = next(); 437 } 438 } else if (n < 0) { 439 for (; n < 0 && result != DONE; ++n) { 440 result = previous(); 441 } 442 } else { 443 result = current(); 444 } 445 return result; 446 } 447 448 /** 449 * Advances the iterator to the next boundary position. 450 * @return The position of the first boundary after this one. 451 */ 452 @Override next()453 public int next() { 454 fBreakCache.next(); 455 return fDone ? DONE : fPosition; 456 } 457 458 /** 459 * Moves the iterator backwards, to the boundary preceding the current one. 460 * @return The position of the boundary position immediately preceding the starting position. 461 */ 462 @Override previous()463 public int previous() { 464 fBreakCache.previous(); 465 return fDone ? DONE : fPosition; 466 } 467 468 /** 469 * Sets the iterator to refer to the first boundary position following 470 * the specified position. 471 * @param startPos The position from which to begin searching for a break position. 472 * @return The position of the first break after the current position. 473 */ 474 @Override following(int startPos)475 public int following(int startPos) { 476 // if the supplied position is before the beginning, return the 477 // text's starting offset 478 if (startPos < fText.getBeginIndex()) { 479 return first(); 480 } 481 482 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 483 // Or it may be beyond the end of the text. 484 startPos = CISetIndex32(fText, startPos); 485 fBreakCache.following(startPos); 486 return fDone ? DONE : fPosition; 487 } 488 489 490 /** 491 * Sets the iterator to refer to the last boundary position before the 492 * specified position. 493 * @param offset The position to begin searching for a break from. 494 * @return The position of the last boundary before the starting position. 495 */ 496 @Override preceding(int offset)497 public int preceding(int offset) { 498 if (fText == null || offset > fText.getEndIndex()) { 499 return last(); 500 } else if (offset < fText.getBeginIndex()) { 501 return first(); 502 } 503 504 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 505 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 506 int adjustedOffset = offset; 507 fBreakCache.preceding(adjustedOffset); 508 return fDone ? DONE : fPosition; 509 510 } 511 512 513 /** 514 * Throw IllegalArgumentException unless begin <= offset < end. 515 */ checkOffset(int offset, CharacterIterator text)516 protected static final void checkOffset(int offset, CharacterIterator text) { 517 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 518 throw new IllegalArgumentException("offset out of bounds"); 519 } 520 } 521 522 523 /** 524 * Returns true if the specified position is a boundary position. As a side 525 * effect, leaves the iterator pointing to the first boundary position at 526 * or after "offset". 527 * @param offset the offset to check. 528 * @return True if "offset" is a boundary position. 529 */ 530 @Override isBoundary(int offset)531 public boolean isBoundary(int offset) { 532 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 533 // to not be boundaries, and to not be errors. 534 checkOffset(offset, fText); 535 536 // Adjust offset to be on a code point boundary and not beyond the end of the text. 537 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 538 // But we still need the side effect of leaving iteration at the following boundary. 539 int adjustedOffset = CISetIndex32(fText, offset); 540 541 boolean result = false; 542 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 543 result = (fBreakCache.current() == offset); 544 } 545 546 if (!result) { 547 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 548 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 549 next(); 550 } 551 return result; 552 553 } 554 555 /** 556 * Returns the current iteration position. Note that DONE is never 557 * returned from this function; if iteration has run to the end of a 558 * string, current() will return the length of the string while 559 * next() will return BreakIterator.DONE). 560 * @return The current iteration position. 561 */ 562 @Override current()563 public int current() { 564 return (fText != null) ? fPosition : BreakIterator.DONE; 565 } 566 567 568 /** 569 * Return the status tag from the break rule that determined the boundary at 570 * the current iteration position. The values appear in the rule source 571 * within brackets, {123}, for example. For rules that do not specify a 572 * status, a default value of 0 is returned. If more than one rule applies, 573 * the numerically largest of the possible status values is returned. 574 * <p> 575 * Of the standard types of ICU break iterators, only the word and line break 576 * iterator provides status values. The values are defined in 577 * class RuleBasedBreakIterator, and allow distinguishing between words 578 * that contain alphabetic letters, "words" that appear to be numbers, 579 * punctuation and spaces, words containing ideographic characters, and 580 * more. Call <code>getRuleStatus</code> after obtaining a boundary 581 * position from <code>next()</code>, <code>previous()</code>, or 582 * any other break iterator functions that returns a boundary position. 583 * <p> 584 * Note that <code>getRuleStatus()</code> returns the value corresponding to 585 * <code>current()</code> index even after <code>next()</code> has returned DONE. 586 * <p> 587 588 * @return the status from the break rule that determined the boundary 589 * at the current iteration position. 590 */ 591 592 @Override getRuleStatus()593 public int getRuleStatus() { 594 // Status records have this form: 595 // Count N <-- fLastRuleStatusIndex points here. 596 // Status val 0 597 // Status val 1 598 // ... 599 // Status val N-1 <-- the value we need to return 600 // The status values are sorted in ascending order. 601 // This function returns the last (largest) of the array of status values. 602 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 603 int tagVal = fRData.fStatusTable[idx]; 604 return tagVal; 605 } 606 607 /** 608 * Get the status (tag) values from the break rule(s) that determined the boundary 609 * at the current iteration position. The values appear in the rule source 610 * within brackets, {123}, for example. The default status value for rules 611 * that do not explicitly provide one is zero. 612 * <p> 613 * The status values used by the standard ICU break rules are defined 614 * as public constants in class RuleBasedBreakIterator. 615 * <p> 616 * If the size of the output array is insufficient to hold the data, 617 * the output will be truncated to the available length. No exception 618 * will be thrown. 619 * 620 * @param fillInArray an array to be filled in with the status values. 621 * @return The number of rule status values from the rules that determined 622 * the boundary at the current iteration position. 623 * In the event that the array is too small, the return value 624 * is the total number of status values that were available, 625 * not the reduced number that were actually returned. 626 */ 627 @Override getRuleStatusVec(int[] fillInArray)628 public int getRuleStatusVec(int[] fillInArray) { 629 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 630 if (fillInArray != null) { 631 int numToCopy = Math.min(numStatusVals, fillInArray.length); 632 for (int i=0; i<numToCopy; i++) { 633 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 634 } 635 } 636 return numStatusVals; 637 } 638 639 /** 640 * Returns a CharacterIterator over the text being analyzed. 641 * <p> 642 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 643 * must not be modified in any way while the BreakIterator is still in use. 644 * Doing so will lead to undefined behavior of the BreakIterator. 645 * Clone the returned CharacterIterator first and work with that. 646 * <p> 647 * The returned CharacterIterator is a reference 648 * to the <b>actual iterator being used</b> by the BreakIterator. 649 * No guarantees are made about the current position 650 * of this iterator when it is returned; it may differ from the 651 * BreakIterators current position. If you need to move that 652 * position to examine the text, clone this function's return value first. 653 * @return An iterator over the text being analyzed. 654 */ 655 @Override getText()656 public CharacterIterator getText() { 657 return fText; 658 } 659 660 /** 661 * Set the iterator to analyze a new piece of text. This function resets 662 * the current iteration position to the beginning of the text. 663 * (The old iterator is dropped.) 664 * <p> 665 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 666 * directly by the BreakIterator, and must not be altered in any 667 * way by code outside of the BreakIterator. 668 * Doing so will lead to undefined behavior of the BreakIterator. 669 * 670 * @param newText An iterator over the text to analyze. 671 */ 672 @Override setText(CharacterIterator newText)673 public void setText(CharacterIterator newText) { 674 if (newText != null) { 675 fBreakCache.reset(newText.getBeginIndex(), 0); 676 } else { 677 fBreakCache.reset(); 678 } 679 fDictionaryCache.reset(); 680 fText = newText; 681 this.first(); 682 } 683 684 /** 685 * Control debug, trace and dump options. 686 * @deprecated This API is ICU internal only. 687 * @hide draft / provisional / internal are hidden on Android 688 */ 689 @Deprecated 690 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 691 ICUDebug.value(RBBI_DEBUG_ARG) : null; 692 693 getLanguageBreakEngine(int c)694 private LanguageBreakEngine getLanguageBreakEngine(int c) { 695 696 // We have a dictionary character. 697 // Does an already instantiated break engine handle it? 698 // First read without synchronization, which could lead to a new language 699 // break engine being added and we didn't go over it. 700 for (LanguageBreakEngine candidate : gAllBreakEngines) { 701 if (candidate.handles(c)) { 702 return candidate; 703 } 704 } 705 706 synchronized (gAllBreakEngines) { 707 // Another break iterator may have instantiated the desired engine. 708 for (LanguageBreakEngine candidate : gAllBreakEngines) { 709 if (candidate.handles(c)) { 710 return candidate; 711 } 712 } 713 714 // The global list doesn't have an existing engine, build one. 715 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 716 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 717 // Katakana, Hiragana and Han are handled by the same dictionary engine. 718 // Fold them together for mapping from script -> engine. 719 script = UScript.HAN; 720 } 721 722 LanguageBreakEngine eng; 723 try { 724 switch (script) { 725 case UScript.THAI: 726 try { 727 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 728 } catch (MissingResourceException e) { 729 eng = new ThaiBreakEngine(); 730 } 731 break; 732 case UScript.LAO: 733 eng = new LaoBreakEngine(); 734 break; 735 case UScript.MYANMAR: 736 try { 737 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 738 } catch (MissingResourceException e) { 739 eng = new BurmeseBreakEngine(); 740 } 741 break; 742 case UScript.KHMER: 743 eng = new KhmerBreakEngine(); 744 break; 745 case UScript.HAN: 746 eng = new CjkBreakEngine(false); 747 break; 748 case UScript.HANGUL: 749 eng = new CjkBreakEngine(true); 750 break; 751 default: 752 gUnhandledBreakEngine.handleChar(c); 753 eng = gUnhandledBreakEngine; 754 break; 755 } 756 } catch (IOException e) { 757 eng = null; 758 } 759 760 if (eng != null && eng != gUnhandledBreakEngine) { 761 gAllBreakEngines.add(eng); 762 } 763 return eng; 764 } // end synchronized(gAllBreakEngines) 765 } 766 767 /** 768 * The State Machine Engine for moving forward is here. 769 * This function is the heart of the RBBI run time engine. 770 * 771 * Input 772 * fPosition, the position in the text to begin from. 773 * Output 774 * fPosition: the boundary following the starting position. 775 * fDictionaryCharCount the number of dictionary characters encountered. 776 * If > 0, the segment will be further subdivided 777 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 778 * 779 * @return the new iterator position 780 * 781 * A note on supplementary characters and the position of underlying 782 * Java CharacterIterator: Normally, a character iterator is positioned at 783 * the char most recently returned by next(). Within this function, when 784 * a supplementary char is being processed, the char iterator is left 785 * sitting on the trail surrogate, in the middle of the code point. 786 * This is different from everywhere else, where an iterator always 787 * points at the lead surrogate of a supplementary. 788 */ handleNext()789 private int handleNext() { 790 if (TRACE) { 791 System.out.println("Handle Next pos char state category"); 792 } 793 794 // handleNext always sets the break tag value. 795 // Set the default for it. 796 fRuleStatusIndex = 0; 797 fDictionaryCharCount = 0; 798 799 // caches for quicker access 800 CharacterIterator text = fText; 801 CodePointTrie trie = fRData.fTrie; 802 803 char[] stateTable = fRData.fFTable.fTable; 804 int initialPosition = fPosition; 805 text.setIndex(initialPosition); 806 int result = initialPosition; 807 808 // Set up the starting char 809 int c = text.current(); 810 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 811 c = nextTrail32(text, c); 812 if (c == DONE32) { 813 fDone = true; 814 return BreakIterator.DONE; 815 } 816 } 817 818 // Set the initial state for the state machine 819 int state = START_STATE; 820 int row = fRData.getRowIndex(state); 821 short category = 3; 822 int flagsState = fRData.fFTable.fFlags; 823 int dictStart = fRData.fFTable.fDictCategoriesStart; 824 int mode = RBBI_RUN; 825 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 826 category = 2; 827 mode = RBBI_START; 828 if (TRACE) { 829 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 830 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 831 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 832 } 833 } 834 835 // loop until we reach the end of the text or transition to state 0 836 while (state != STOP_STATE) { 837 if (c == DONE32) { 838 // Reached end of input string. 839 if (mode == RBBI_END) { 840 // We have already run the loop one last time with the 841 // character set to the pseudo {eof} value. Now it is time 842 // to unconditionally bail out. 843 break; 844 } 845 // Run the loop one last time with the fake end-of-input character category 846 mode = RBBI_END; 847 category = 1; 848 } 849 else if (mode == RBBI_RUN) { 850 // Get the char category. An incoming category of 1 or 2 mens that 851 // we are preset for doing the beginning or end of input, and 852 // that we shouldn't get a category from an actual text input character. 853 // 854 855 // look up the current character's character category, which tells us 856 // which column in the state table to look at. 857 // 858 category = (short) trie.get(c); 859 860 // Check for categories that require word dictionary handling. 861 if (category >= dictStart) { 862 fDictionaryCharCount++; 863 } 864 865 if (TRACE) { 866 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 867 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 868 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 869 } 870 871 // Advance to the next character. 872 // If this is a beginning-of-input loop iteration, don't advance. 873 // The next iteration will be processing the first real input character. 874 c = text.next(); 875 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 876 c = nextTrail32(text, c); 877 } 878 } 879 else { 880 mode = RBBI_RUN; 881 } 882 883 // look up a state transition in the state table 884 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 885 row = fRData.getRowIndex(state); 886 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 887 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 888 // Match found, common case 889 result = text.getIndex(); 890 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 891 // The iterator has been left in the middle of a surrogate pair. 892 // We want the start of it. 893 result--; 894 } 895 896 // Remember the break status (tag) values. 897 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 898 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 899 // Lookahead match is completed 900 int lookaheadResult = fLookAheadMatches[accepting]; 901 if (lookaheadResult >= 0) { 902 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 903 fPosition = lookaheadResult; 904 return lookaheadResult; 905 } 906 } 907 908 909 // If we are at the position of the '/' in a look-ahead (hard break) rule; 910 // record the current position, to be returned later, if the full rule matches. 911 // TODO: Move this check before the previous check of fAccepting. 912 // This would enable hard-break rules with no following context. 913 // But there are line break test failures when trying this. Investigate. 914 // Issue ICU-20837 915 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 916 if (rule != 0) { 917 int pos = text.getIndex(); 918 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 919 // The iterator has been left in the middle of a surrogate pair. 920 // We want the beginning of it. 921 pos--; 922 } 923 fLookAheadMatches[rule] = pos; 924 } 925 926 927 } // End of state machine main loop 928 929 // The state machine is done. Check whether it found a match... 930 931 // If the iterator failed to advance in the match engine force it ahead by one. 932 // This indicates a defect in the break rules, which should always match 933 // at least one character. 934 935 if (result == initialPosition) { 936 if (TRACE) { 937 System.out.println("Iterator did not move. Advancing by 1."); 938 } 939 text.setIndex(initialPosition); 940 next32(text); 941 result = text.getIndex(); 942 fRuleStatusIndex = 0; 943 } 944 945 // Leave the iterator at our result position. 946 // (we may have advanced beyond the last accepting position chasing after 947 // longer matches that never completed.) 948 fPosition = result; 949 950 if (TRACE) { 951 System.out.println("result = " + result); 952 } 953 return result; 954 } 955 956 /** 957 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 958 * This locates a "Safe Position" from which the forward break rules 959 * will operate correctly. A Safe Position is not necessarily a boundary itself. 960 * 961 * The logic of this function is very similar to handleNext(), above, but simpler 962 * because the safe table does not require as many options. 963 * 964 * @param fromPosition the position in the input text to begin the iteration. 965 * @hide draft / provisional / internal are hidden on Android 966 */ handleSafePrevious(int fromPosition)967 private int handleSafePrevious(int fromPosition) { 968 char state; 969 short category = 0; 970 int result = 0; 971 972 // caches for quicker access 973 CharacterIterator text = fText; 974 CodePointTrie trie = fRData.fTrie; 975 char[] stateTable = fRData.fRTable.fTable; 976 977 CISetIndex32(text, fromPosition); 978 if (TRACE) { 979 System.out.print("Handle Previous pos char state category"); 980 } 981 982 // if we're already at the start of the text, return DONE. 983 if (text.getIndex() == text.getBeginIndex()) { 984 return BreakIterator.DONE; 985 } 986 987 // Set the initial state for the state machine 988 int c = CharacterIteration.previous32(text); 989 state = START_STATE; 990 int row = fRData.getRowIndex(state); 991 992 // loop until we reach the start of the text or transition to state 0 993 // 994 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 995 996 // look up the current character's character category, which tells us 997 // which column in the state table to look at. 998 // 999 // And off the dictionary flag bit. For reverse iteration it is not used. 1000 category = (short) trie.get(c); 1001 if (TRACE) { 1002 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1003 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1004 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1005 } 1006 1007 // State Transition - move machine to its next state 1008 // 1009 assert(category < fRData.fHeader.fCatCount); 1010 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1011 row = fRData.getRowIndex(state); 1012 1013 if (state == STOP_STATE) { 1014 // This is the normal exit from the lookup state machine. 1015 // Transition to state zero means we have found a safe point. 1016 break; 1017 } 1018 } 1019 1020 // The state machine is done. 1021 result = text.getIndex(); 1022 if (TRACE) { 1023 System.out.println("result = " + result); 1024 } 1025 return result; 1026 } 1027 1028 /** 1029 * Set the index of a CharacterIterator. 1030 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1031 * If the index points to a trail surrogate of a supplementary character, adjust it 1032 * to the start (lead surrogate) index. 1033 * 1034 * @param ci A CharacterIterator to set 1035 * @param index the index to set 1036 * @return the resulting index, possibly pinned or adjusted. 1037 */ 1038 private static int CISetIndex32(CharacterIterator ci, int index) { 1039 if (index <= ci.getBeginIndex()) { 1040 ci.first(); 1041 } else if (index >= ci.getEndIndex()) { 1042 ci.setIndex(ci.getEndIndex()); 1043 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1044 if (!Character.isHighSurrogate(ci.previous())) { 1045 ci.next(); 1046 } 1047 } 1048 return ci.getIndex(); 1049 } 1050 1051 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1052 * Dictionary boundaries are moved first to this cache, then from here 1053 * to the main BreakCache, where they may inter-leave with non-dictionary 1054 * boundaries. The public BreakIterator API always fetches directly 1055 * from the main BreakCache, not from here. 1056 * 1057 * In common situations, the number of boundaries in a single dictionary run 1058 * should be quite small, it will be terminated by punctuation, spaces, 1059 * or any other non-dictionary characters. The main BreakCache may end 1060 * up with boundaries from multiple dictionary based runs. 1061 * 1062 * The boundaries are stored in a simple ArrayList (vector), with the 1063 * assumption that they will be accessed sequentially. 1064 */ 1065 class DictionaryCache { 1066 1067 void reset() { 1068 fPositionInCache = -1; 1069 fStart = 0; 1070 fLimit = 0; 1071 fFirstRuleStatusIndex = 0; 1072 fOtherRuleStatusIndex = 0; 1073 fBreaks.removeAllElements(); 1074 }; 1075 1076 boolean following(int fromPos) { 1077 if (fromPos >= fLimit || fromPos < fStart) { 1078 fPositionInCache = -1; 1079 return false; 1080 } 1081 1082 // Sequential iteration, move from previous boundary to the following 1083 1084 int r = 0; 1085 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1086 ++fPositionInCache; 1087 if (fPositionInCache >= fBreaks.size()) { 1088 fPositionInCache = -1; 1089 return false; 1090 } 1091 r = fBreaks.elementAt(fPositionInCache); 1092 assert(r > fromPos); 1093 fBoundary = r; 1094 fStatusIndex = fOtherRuleStatusIndex; 1095 return true; 1096 } 1097 1098 // Random indexing. Linear search for the boundary following the given position. 1099 1100 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1101 r= fBreaks.elementAt(fPositionInCache); 1102 if (r > fromPos) { 1103 fBoundary = r; 1104 fStatusIndex = fOtherRuleStatusIndex; 1105 return true; 1106 } 1107 } 1108 1109 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1110 assert(false); 1111 fPositionInCache = -1; 1112 return false; 1113 }; 1114 preceding(int fromPos)1115 boolean preceding(int fromPos) { 1116 if (fromPos <= fStart || fromPos > fLimit) { 1117 fPositionInCache = -1; 1118 return false; 1119 } 1120 1121 if (fromPos == fLimit) { 1122 fPositionInCache = fBreaks.size() - 1; 1123 if (fPositionInCache >= 0) { 1124 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1125 } 1126 } 1127 1128 int r; 1129 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1130 --fPositionInCache; 1131 r = fBreaks.elementAt(fPositionInCache); 1132 assert(r < fromPos); 1133 fBoundary = r; 1134 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1135 return true; 1136 } 1137 1138 if (fPositionInCache == 0) { 1139 fPositionInCache = -1; 1140 return false; 1141 } 1142 1143 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1144 r = fBreaks.elementAt(fPositionInCache); 1145 if (r < fromPos) { 1146 fBoundary = r; 1147 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1148 return true; 1149 } 1150 } 1151 assert(false); 1152 fPositionInCache = -1; 1153 return false; 1154 }; 1155 1156 /** 1157 * Populate the cache with the dictionary based boundaries within a region of text. 1158 * @param startPos The start position of a range of text 1159 * @param endPos The end position of a range of text 1160 * @param firstRuleStatus The rule status index that applies to the break at startPos 1161 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1162 * @hide draft / provisional / internal are hidden on Android 1163 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1164 void populateDictionary(int startPos, int endPos, 1165 int firstRuleStatus, int otherRuleStatus) { 1166 if ((endPos - startPos) <= 1) { 1167 return; 1168 } 1169 1170 reset(); 1171 fFirstRuleStatusIndex = firstRuleStatus; 1172 fOtherRuleStatusIndex = otherRuleStatus; 1173 1174 int rangeStart = startPos; 1175 int rangeEnd = endPos; 1176 1177 int category; 1178 int current; 1179 int foundBreakCount = 0; 1180 1181 // Loop through the text, looking for ranges of dictionary characters. 1182 // For each span, find the appropriate break engine, and ask it to find 1183 // any breaks within the span. 1184 1185 fText.setIndex(rangeStart); 1186 int c = CharacterIteration.current32(fText); 1187 category = (short)fRData.fTrie.get(c); 1188 int dictStart = fRData.fFTable.fDictCategoriesStart; 1189 1190 while(true) { 1191 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1192 c = CharacterIteration.next32(fText); // pre-increment 1193 category = (short)fRData.fTrie.get(c); 1194 } 1195 if (current >= rangeEnd) { 1196 break; 1197 } 1198 1199 // We now have a dictionary character. Get the appropriate language object 1200 // to deal with it. 1201 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1202 1203 // Ask the language object if there are any breaks. It will add them to the cache and 1204 // leave the text pointer on the other side of its range, ready to search for the next one. 1205 if (lbe != null) { 1206 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); 1207 } 1208 1209 // Reload the loop variables for the next go-round 1210 c = CharacterIteration.current32(fText); 1211 category = (short)fRData.fTrie.get(c); 1212 } 1213 1214 // If we found breaks, ensure that the first and last entries are 1215 // the original starting and ending position. And initialize the 1216 // cache iteration position to the first entry. 1217 1218 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1219 if (foundBreakCount > 0) { 1220 assert(foundBreakCount == fBreaks.size()); 1221 if (startPos < fBreaks.elementAt(0)) { 1222 // The dictionary did not place a boundary at the start of the segment of text. 1223 // Add one now. This should not commonly happen, but it would be easy for interactions 1224 // of the rules for dictionary segments and the break engine implementations to 1225 // inadvertently cause it. Cover it here, just in case. 1226 fBreaks.offer(startPos); 1227 } 1228 if (endPos > fBreaks.peek()) { 1229 fBreaks.push(endPos); 1230 } 1231 fPositionInCache = 0; 1232 // Note: Dictionary matching may extend beyond the original limit. 1233 fStart = fBreaks.elementAt(0); 1234 fLimit = fBreaks.peek(); 1235 } else { 1236 // there were no language-based breaks, even though the segment contained 1237 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1238 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1239 } 1240 1241 }; 1242 1243 DictionaryCache()1244 DictionaryCache() { 1245 fPositionInCache = -1; 1246 fBreaks = new DictionaryBreakEngine.DequeI(); 1247 } 1248 1249 /** 1250 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1251 * 1252 * @param src the source object to be copied. 1253 */ DictionaryCache(DictionaryCache src)1254 DictionaryCache(DictionaryCache src) { 1255 try { 1256 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1257 } 1258 catch (CloneNotSupportedException e) { 1259 throw new RuntimeException(e); 1260 } 1261 fPositionInCache = src.fPositionInCache; 1262 fStart = src.fStart; 1263 fLimit = src.fLimit; 1264 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1265 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1266 fBoundary = src.fBoundary; 1267 fStatusIndex = src.fStatusIndex; 1268 } 1269 1270 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1271 DictionaryBreakEngine.DequeI fBreaks; 1272 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1273 // // or preceding(). Optimizes sequential access. 1274 int fStart; // Text position of first boundary in cache. 1275 int fLimit; // Last boundary in cache. Which is the limit of the 1276 // // text segment being handled by the dictionary. 1277 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1278 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1279 int fBoundary; // Current boundary. Set by preceding(), following(). 1280 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1281 }; 1282 1283 1284 1285 1286 /* 1287 * class BreakCache 1288 * 1289 * Cache of break boundary positions and rule status values. 1290 * Break iterator API functions, next(), previous(), etc., will use cached results 1291 * when possible, and otherwise cache new results as they are obtained. 1292 * 1293 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1294 * 1295 * The cache is implemented as a single circular buffer. 1296 */ 1297 1298 /* 1299 * size of the circular cache buffer. 1300 */ 1301 1302 class BreakCache { 1303 BreakCache()1304 BreakCache() { 1305 reset(); 1306 }; 1307 reset(int pos, int ruleStatus)1308 void reset(int pos, int ruleStatus) { 1309 fStartBufIdx = 0; 1310 fEndBufIdx = 0; 1311 fTextIdx = pos; 1312 fBufIdx = 0; 1313 fBoundaries[0] = pos; 1314 fStatuses[0] = (short)ruleStatus; 1315 } 1316 reset()1317 void reset() {reset(0, 0); }; 1318 next()1319 void next() { 1320 if (fBufIdx == fEndBufIdx) { 1321 fDone = !populateFollowing(); 1322 fPosition = fTextIdx; 1323 fRuleStatusIndex = fStatuses[fBufIdx]; 1324 } else { 1325 fBufIdx = modChunkSize(fBufIdx + 1); 1326 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1327 fRuleStatusIndex = fStatuses[fBufIdx]; 1328 } 1329 }; 1330 previous()1331 void previous() { 1332 int initialBufIdx = fBufIdx; 1333 if (fBufIdx == fStartBufIdx) { 1334 // At start of cache. Prepend to it. 1335 populatePreceding(); 1336 } else { 1337 // Cache already holds the next boundary 1338 fBufIdx = modChunkSize(fBufIdx - 1); 1339 fTextIdx = fBoundaries[fBufIdx]; 1340 } 1341 fDone = (fBufIdx == initialBufIdx); 1342 fPosition = fTextIdx; 1343 fRuleStatusIndex = fStatuses[fBufIdx]; 1344 return; 1345 }; 1346 1347 // Move the iteration state to the position following the startPosition. 1348 // Input position must be pinned to the input length. following(int startPos)1349 void following(int startPos) { 1350 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1351 // startPos is in the cache. Do a next() from that position. 1352 // TODO: an awkward set of interactions with bi->fDone 1353 // seek() does not clear it; it can't because of interactions with populateNear(). 1354 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1355 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1356 fDone = false; 1357 next(); 1358 } 1359 1360 }; 1361 preceding(int startPos)1362 void preceding(int startPos) { 1363 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1364 if (startPos == fTextIdx) { 1365 previous(); 1366 } else { 1367 // seek() leaves the BreakCache positioned at the preceding boundary 1368 // if the requested position is between two boundaries. 1369 // current() pushes the BreakCache position out to the BreakIterator itself. 1370 assert(startPos > fTextIdx); 1371 current(); 1372 } 1373 } 1374 return; 1375 }; 1376 1377 /** 1378 * Update the state of the public BreakIterator (fBI) to reflect the 1379 * current state of the break iterator cache (this). 1380 */ current()1381 int current() { 1382 fPosition = fTextIdx; 1383 fRuleStatusIndex = fStatuses[fBufIdx]; 1384 fDone = false; 1385 return fTextIdx; 1386 }; 1387 1388 /** 1389 * Add boundaries to the cache near the specified position. 1390 * The given position need not be a boundary itself. 1391 * The input position must be within the range of the text, and 1392 * on a code point boundary. 1393 * If the requested position is a break boundary, leave the iteration 1394 * position on it. 1395 * If the requested position is not a boundary, leave the iteration 1396 * position on the preceding boundary and include both the the 1397 * preceding and following boundaries in the cache. 1398 * Additional boundaries, either preceding or following, may be added 1399 * to the cache as a side effect. 1400 * 1401 * Return false if the operation failed. 1402 */ populateNear(int position)1403 boolean populateNear(int position) { 1404 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1405 1406 // Add boundaries to the cache near the specified position. 1407 // The given position need not be a boundary itself. 1408 // The input position must be within the range of the text, and 1409 // on a code point boundary. 1410 // If the requested position is a break boundary, leave the iteration 1411 // position on it. 1412 // If the requested position is not a boundary, leave the iteration 1413 // position on the preceding boundary and include both the 1414 // preceding and following boundaries in the cache. 1415 // Additional boundaries, either preceding or following, may be added 1416 // to the cache as a side effect. 1417 1418 // If the requested position is not near already cached positions, clear the existing cache, 1419 // find a near-by boundary and begin new cache contents there. 1420 1421 // Threshold for a text position to be considered near to existing cache contents. 1422 // TODO: See issue ICU-22024 "perf tuning of Cache needed." 1423 // This value is subject to change. See the ticket for more details. 1424 final int CACHE_NEAR = 15; 1425 1426 int startOfText = fText.getBeginIndex(); 1427 int aBoundary = -1; 1428 int ruleStatusIndex = 0; 1429 boolean retainCache = false; 1430 if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { 1431 // Requested position is near the existing cache. Retain it. 1432 retainCache = true; 1433 } else if (position <= startOfText + CACHE_NEAR) { 1434 // Requested position is near the start of the text. Fill cache from start, skipping 1435 // the need to find a safe point. 1436 retainCache = false; 1437 aBoundary = startOfText; 1438 } else { 1439 // Requested position is not near the existing cache. 1440 // Find a safe point to refill the cache from. 1441 int backupPos = handleSafePrevious(position); 1442 1443 if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { 1444 // The requested position is beyond the end of the existing cache, but the 1445 // reverse rules produced a position near or before the cached region. 1446 // Retain the existing cache, and fill from the end of it. 1447 retainCache = true; 1448 } else if (backupPos < startOfText + CACHE_NEAR) { 1449 // The safe reverse rules moved us to near the start of text. 1450 // Take that (index 0) as the backup boundary, avoiding the complication 1451 // (in the following block) of moving forward from the safe point to a known boundary. 1452 // 1453 // Retain the cache if it begins not too far from the requested position. 1454 aBoundary = startOfText; 1455 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); 1456 } else { 1457 // The safe reverse rules produced a position that is neither near the existing 1458 // cache, nor near the start of text. 1459 // Advance to the boundary following. 1460 // There is a complication: the safe reverse rules identify pairs of code points 1461 // that are safe. If advancing from the safe point moves forwards by less than 1462 // two code points, we need to advance one more time to ensure that the boundary 1463 // is good, including a correct rules status value. 1464 // 1465 retainCache = false; 1466 fPosition = backupPos; 1467 aBoundary = handleNext(); 1468 if (aBoundary == backupPos + 1 || 1469 (aBoundary == backupPos + 2 && 1470 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1471 Character.isLowSurrogate(fText.next()))) { 1472 // The initial handleNext() only advanced by a single code point. Go again. 1473 // Safe rules identify safe pairs. 1474 aBoundary = handleNext(); 1475 } 1476 if (aBoundary == BreakIterator.DONE) { 1477 aBoundary = fText.getEndIndex(); 1478 } 1479 ruleStatusIndex = fRuleStatusIndex; 1480 } 1481 } 1482 1483 if (!retainCache) { 1484 assert(aBoundary != -1); 1485 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1486 } 1487 1488 1489 // Fill in boundaries between existing cache content and the new requested position. 1490 1491 if (fBoundaries[fEndBufIdx] < position) { 1492 // The last position in the cache precedes the requested position. 1493 // Add following position(s) to the cache. 1494 while (fBoundaries[fEndBufIdx] < position) { 1495 if (!populateFollowing()) { 1496 assert false; 1497 return false; 1498 } 1499 } 1500 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1501 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1502 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1503 previous(); 1504 } 1505 return true; 1506 } 1507 1508 if (fBoundaries[fStartBufIdx] > position) { 1509 // The first position in the cache is beyond the requested position. 1510 // back up more until we get a boundary <= the requested position. 1511 while (fBoundaries[fStartBufIdx] > position) { 1512 populatePreceding(); 1513 } 1514 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1515 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1516 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1517 next(); 1518 } 1519 if (fTextIdx > position) { 1520 // If position is not itself a boundary, the next() loop above will overshoot. 1521 // Back up one, leaving cache position at the boundary preceding the requested position. 1522 previous(); 1523 } 1524 return true; 1525 } 1526 1527 assert fTextIdx == position; 1528 return true; 1529 1530 }; 1531 1532 /** 1533 * Add boundary(s) to the cache following the current last boundary. 1534 * Return false if at the end of the text, and no more boundaries can be added. 1535 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1536 */ populateFollowing()1537 boolean populateFollowing() { 1538 int fromPosition = fBoundaries[fEndBufIdx]; 1539 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1540 int pos = 0; 1541 int ruleStatusIdx = 0; 1542 1543 if (fDictionaryCache.following(fromPosition)) { 1544 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1545 return true; 1546 } 1547 1548 fPosition = fromPosition; 1549 pos = handleNext(); 1550 if (pos == BreakIterator.DONE) { 1551 return false; 1552 } 1553 1554 ruleStatusIdx = fRuleStatusIndex; 1555 if (fDictionaryCharCount > 0) { 1556 // The text segment obtained from the rules includes dictionary characters. 1557 // Subdivide it, with subdivided results going into the dictionary cache. 1558 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1559 if (fDictionaryCache.following(fromPosition)) { 1560 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1561 return true; 1562 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1563 // But be careful with interactions with populateNear(). 1564 } 1565 } 1566 1567 // Rule based segment did not include dictionary characters. 1568 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1569 // meaning that we didn't take the return, above. 1570 // Add its end point to the cache. 1571 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1572 1573 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1574 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1575 // 1576 for (int count=0; count<6; ++count) { 1577 pos = handleNext(); 1578 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1579 break; 1580 } 1581 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1582 } 1583 return true; 1584 }; 1585 1586 /** 1587 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1588 * Leave the iteration position on the first added boundary. 1589 * Return false if no boundaries could be added (if at the start of the text.) 1590 */ populatePreceding()1591 boolean populatePreceding() { 1592 int textBegin = fText.getBeginIndex(); 1593 int fromPosition = fBoundaries[fStartBufIdx]; 1594 if (fromPosition == textBegin) { 1595 return false; 1596 } 1597 1598 int position = textBegin; 1599 int positionStatusIdx = 0; 1600 1601 if (fDictionaryCache.preceding(fromPosition)) { 1602 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1603 return true; 1604 } 1605 1606 int backupPosition = fromPosition; 1607 1608 // Find a boundary somewhere preceding the first already-cached boundary 1609 do { 1610 backupPosition = backupPosition - 30; 1611 if (backupPosition <= textBegin) { 1612 backupPosition = textBegin; 1613 } else { 1614 backupPosition = handleSafePrevious(backupPosition); 1615 } 1616 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1617 position = textBegin; 1618 positionStatusIdx = 0; 1619 } else { 1620 // Advance to the boundary following the backup position. 1621 // There is a complication: the safe reverse rules identify pairs of code points 1622 // that are safe. If advancing from the safe point moves forwards by less than 1623 // two code points, we need to advance one more time to ensure that the boundary 1624 // is good, including a correct rules status value. 1625 // 1626 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1627 position = handleNext(); 1628 if (position == backupPosition + 1 || 1629 (position == backupPosition + 2 && 1630 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1631 Character.isLowSurrogate(fText.next()))) { 1632 // The initial handleNext() only advanced by a single code point. Go again. 1633 // Safe rules identify safe pairs. 1634 position = handleNext(); 1635 } 1636 positionStatusIdx = fRuleStatusIndex; 1637 } 1638 } while (position >= fromPosition); 1639 1640 // Find boundaries between the one we just located and the first already-cached boundary 1641 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1642 1643 fSideBuffer.removeAllElements(); 1644 fSideBuffer.push(position); 1645 fSideBuffer.push(positionStatusIdx); 1646 1647 do { 1648 int prevPosition = fPosition = position; 1649 int prevStatusIdx = positionStatusIdx; 1650 position = handleNext(); 1651 positionStatusIdx = fRuleStatusIndex; 1652 if (position == BreakIterator.DONE) { 1653 break; 1654 } 1655 1656 boolean segmentHandledByDictionary = false; 1657 if (fDictionaryCharCount != 0) { 1658 // Segment from the rules includes dictionary characters. 1659 // Subdivide it, with subdivided results going into the dictionary cache. 1660 int dictSegEndPosition = position; 1661 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1662 while (fDictionaryCache.following(prevPosition)) { 1663 position = fDictionaryCache.fBoundary; 1664 positionStatusIdx = fDictionaryCache.fStatusIndex; 1665 segmentHandledByDictionary = true; 1666 assert(position > prevPosition); 1667 if (position >= fromPosition) { 1668 break; 1669 } 1670 assert(position <= dictSegEndPosition); 1671 fSideBuffer.push(position); 1672 fSideBuffer.push(positionStatusIdx); 1673 prevPosition = position; 1674 } 1675 assert(position==dictSegEndPosition || position>=fromPosition); 1676 } 1677 1678 if (!segmentHandledByDictionary && position < fromPosition) { 1679 fSideBuffer.push(position); 1680 fSideBuffer.push(positionStatusIdx); 1681 } 1682 } while (position < fromPosition); 1683 1684 // Move boundaries from the side buffer to the main circular buffer. 1685 boolean success = false; 1686 if (!fSideBuffer.isEmpty()) { 1687 positionStatusIdx = fSideBuffer.pop(); 1688 position = fSideBuffer.pop(); 1689 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1690 success = true; 1691 } 1692 1693 while (!fSideBuffer.isEmpty()) { 1694 positionStatusIdx = fSideBuffer.pop(); 1695 position = fSideBuffer.pop(); 1696 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1697 // No space in circular buffer to hold a new preceding result while 1698 // also retaining the current cache (iteration) position. 1699 // Bailing out is safe; the cache will refill again if needed. 1700 break; 1701 } 1702 } 1703 return success; 1704 }; 1705 1706 1707 static final boolean RetainCachePosition = false; 1708 static final boolean UpdateCachePosition = true; 1709 1710 /** 1711 * Add the boundary following the current position. 1712 * The current position can be left as it was, or changed to the newly added boundary, 1713 * as specified by the update parameter. 1714 */ addFollowing(int position, int ruleStatusIdx, boolean update)1715 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1716 assert(position > fBoundaries[fEndBufIdx]); 1717 assert(ruleStatusIdx <= Short.MAX_VALUE); 1718 int nextIdx = modChunkSize(fEndBufIdx + 1); 1719 if (nextIdx == fStartBufIdx) { 1720 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1721 } 1722 fBoundaries[nextIdx] = position; 1723 fStatuses[nextIdx] = (short)ruleStatusIdx; 1724 fEndBufIdx = nextIdx; 1725 if (update == UpdateCachePosition) { 1726 // Set current position to the newly added boundary. 1727 fBufIdx = nextIdx; 1728 fTextIdx = position; 1729 } else { 1730 // Retaining the original cache position. 1731 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1732 // It's the responsibility of callers of this function to not add too many. 1733 assert(nextIdx != fBufIdx); 1734 } 1735 1736 }; 1737 1738 1739 /** 1740 * Add the boundary preceding the current position. 1741 * The current position can be left as it was, or changed to the newly added boundary, 1742 * as specified by the update parameter. 1743 */ addPreceding(int position, int ruleStatusIdx, boolean update)1744 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1745 assert(position < fBoundaries[fStartBufIdx]); 1746 assert(ruleStatusIdx <= Short.MAX_VALUE); 1747 int nextIdx = modChunkSize(fStartBufIdx - 1); 1748 if (nextIdx == fEndBufIdx) { 1749 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1750 // Failure. The insertion of the new boundary would claim the buffer position that is the 1751 // current iteration position. And we also want to retain the current iteration position. 1752 // (The buffer is already completely full of entries that precede the iteration position.) 1753 return false; 1754 } 1755 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1756 } 1757 fBoundaries[nextIdx] = position; 1758 fStatuses[nextIdx] = (short)ruleStatusIdx; 1759 fStartBufIdx = nextIdx; 1760 if (update == UpdateCachePosition) { 1761 fBufIdx = nextIdx; 1762 fTextIdx = position; 1763 } 1764 return true; 1765 }; 1766 1767 /** 1768 * Set the cache position to the specified position, or, if the position 1769 * falls between to cached boundaries, to the preceding boundary. 1770 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1771 * The startPosition must be on a code point boundary. 1772 * 1773 * Return true if successful, false if the specified position is after 1774 * the last cached boundary or before the first. 1775 */ 1776 boolean seek(int pos) { 1777 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1778 return false; 1779 } 1780 if (pos == fBoundaries[fStartBufIdx]) { 1781 // Common case: seek(0), from BreakIterator::first() 1782 fBufIdx = fStartBufIdx; 1783 fTextIdx = fBoundaries[fBufIdx]; 1784 return true; 1785 } 1786 if (pos == fBoundaries[fEndBufIdx]) { 1787 fBufIdx = fEndBufIdx; 1788 fTextIdx = fBoundaries[fBufIdx]; 1789 return true; 1790 } 1791 1792 int min = fStartBufIdx; 1793 int max = fEndBufIdx; 1794 while (min != max) { 1795 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1796 probe = modChunkSize(probe); 1797 if (fBoundaries[probe] > pos) { 1798 max = probe; 1799 } else { 1800 min = modChunkSize(probe + 1); 1801 } 1802 } 1803 assert(fBoundaries[max] > pos); 1804 fBufIdx = modChunkSize(max - 1); 1805 fTextIdx = fBoundaries[fBufIdx]; 1806 assert(fTextIdx <= pos); 1807 return true; 1808 1809 }; 1810 1811 1812 /** 1813 * copy constructor, used from RuleBasedBreakIterator.clone(). 1814 * 1815 * @param src 1816 */ BreakCache(BreakCache src)1817 BreakCache(BreakCache src) { 1818 fStartBufIdx = src.fStartBufIdx; 1819 fEndBufIdx = src.fEndBufIdx; 1820 fTextIdx = src.fTextIdx; 1821 fBufIdx = src.fBufIdx; 1822 fBoundaries = src.fBoundaries.clone(); 1823 fStatuses = src.fStatuses.clone(); 1824 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1825 } 1826 dumpCache()1827 void dumpCache() { 1828 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1829 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1830 System.out.printf("%d %d%n", i, fBoundaries[i]); 1831 if (i == fEndBufIdx) { 1832 break; 1833 } 1834 } 1835 }; 1836 modChunkSize(int index)1837 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1838 1839 static final int CACHE_SIZE = 128; 1840 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1841 1842 int fStartBufIdx; 1843 int fEndBufIdx; // inclusive 1844 1845 int fTextIdx; 1846 int fBufIdx; 1847 1848 int[] fBoundaries = new int[CACHE_SIZE]; 1849 short[] fStatuses = new short[CACHE_SIZE]; 1850 1851 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1852 }; 1853 1854 1855 1856 1857 } 1858 1859