1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2005-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import static com.ibm.icu.impl.CharacterIteration.DONE32; 13 import static com.ibm.icu.impl.CharacterIteration.next32; 14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32; 15 16 import java.io.ByteArrayOutputStream; 17 import java.io.IOException; 18 import java.io.InputStream; 19 import java.io.OutputStream; 20 import java.nio.ByteBuffer; 21 import java.text.CharacterIterator; 22 import java.util.ArrayList; 23 import java.util.List; 24 import java.util.MissingResourceException; 25 26 import com.ibm.icu.impl.CharacterIteration; 27 import com.ibm.icu.impl.ICUBinary; 28 import com.ibm.icu.impl.ICUDebug; 29 import com.ibm.icu.impl.RBBIDataWrapper; 30 import com.ibm.icu.impl.breakiter.BurmeseBreakEngine; 31 import com.ibm.icu.impl.breakiter.CjkBreakEngine; 32 import com.ibm.icu.impl.breakiter.DictionaryBreakEngine; 33 import com.ibm.icu.impl.breakiter.KhmerBreakEngine; 34 import com.ibm.icu.impl.breakiter.LSTMBreakEngine; 35 import com.ibm.icu.impl.breakiter.LanguageBreakEngine; 36 import com.ibm.icu.impl.breakiter.LaoBreakEngine; 37 import com.ibm.icu.impl.breakiter.ThaiBreakEngine; 38 import com.ibm.icu.impl.breakiter.UnhandledBreakEngine; 39 import com.ibm.icu.lang.UCharacter; 40 import com.ibm.icu.lang.UProperty; 41 import com.ibm.icu.lang.UScript; 42 import com.ibm.icu.util.CodePointTrie; 43 44 /** 45 * Rule Based Break Iterator 46 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 47 * 48 * @stable ICU 2.0 49 */ 50 public class RuleBasedBreakIterator extends BreakIterator { 51 //======================================================================= 52 // Constructors & Factories 53 //======================================================================= 54 55 /** 56 * private constructor 57 */ RuleBasedBreakIterator()58 private RuleBasedBreakIterator() { 59 fDictionaryCharCount = 0; 60 synchronized(gAllBreakEngines) { 61 fBreakEngines = new ArrayList<>(gAllBreakEngines); 62 } 63 } 64 65 /** 66 * Create a break iterator from a precompiled set of break rules. 67 * 68 * Creating a break iterator from the binary rules is much faster than 69 * creating one from source rules. 70 * 71 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 72 * Binary break iterator rules are not guaranteed to be compatible between 73 * different versions of ICU. 74 * 75 * @param is an input stream supplying the compiled binary rules. 76 * @throws IOException if there is an error while reading the rules from the InputStream. 77 * @see #compileRules(String, OutputStream) 78 * @stable ICU 4.8 79 */ getInstanceFromCompiledRules(InputStream is)80 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 81 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 82 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 83 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 84 return This; 85 } 86 87 /** 88 * This factory method doesn't have an access modifier; it is only accessible in the same 89 * package. 90 * 91 * Create a break iterator from a precompiled set of break rules. 92 * 93 * Creating a break iterator from the binary rules is much faster than 94 * creating one from source rules. 95 * 96 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 97 * Binary break iterator rules are not guaranteed to be compatible between 98 * different versions of ICU. 99 * 100 * @param bytes a buffer supplying the compiled binary rules. 101 * @param phraseBreaking a flag indicating if phrase breaking is required. 102 * @throws IOException if there is an error while reading the rules from the buffer. 103 * @see #compileRules(String, OutputStream) 104 * @internal 105 */ getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)106 /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( 107 ByteBuffer bytes, boolean phraseBreaking) throws IOException { 108 RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); 109 instance.fPhraseBreaking = phraseBreaking; 110 return instance; 111 } 112 113 /** 114 * Create a break iterator from a precompiled set of break rules. 115 * 116 * Creating a break iterator from the binary rules is much faster than 117 * creating one from source rules. 118 * 119 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 120 * Binary break iterator rules are not guaranteed to be compatible between 121 * different versions of ICU. 122 * 123 * @param bytes a buffer supplying the compiled binary rules. 124 * @throws IOException if there is an error while reading the rules from the buffer. 125 * @see #compileRules(String, OutputStream) 126 * @internal 127 * @deprecated This API is ICU internal only. 128 */ 129 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)130 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 131 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 132 This.fRData = RBBIDataWrapper.get(bytes); 133 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 134 return This; 135 } 136 137 /** 138 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 139 * @param rules The break rules to be used. 140 * @stable ICU 2.2 141 */ RuleBasedBreakIterator(String rules)142 public RuleBasedBreakIterator(String rules) { 143 this(); 144 try { 145 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 146 compileRules(rules, ruleOS); 147 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 148 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 149 } catch (IOException e) { 150 ///CLOVER:OFF 151 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 152 // causing bogus compiled rules to be produced, but with no compile error raised. 153 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 154 + e.getMessage()); 155 throw rte; 156 ///CLOVER:ON 157 } 158 } 159 160 //======================================================================= 161 // Boilerplate 162 //======================================================================= 163 164 /** 165 * Clones this iterator. 166 * @return A newly-constructed RuleBasedBreakIterator with the same 167 * behavior as this one. 168 * @stable ICU 2.0 169 */ 170 @Override clone()171 public Object clone() { 172 RuleBasedBreakIterator result; 173 result = (RuleBasedBreakIterator)super.clone(); 174 if (fText != null) { 175 result.fText = (CharacterIterator)(fText.clone()); 176 } 177 synchronized (gAllBreakEngines) { 178 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 179 } 180 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 181 result.fBreakCache = result.new BreakCache(fBreakCache); 182 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 183 return result; 184 } 185 186 187 /** 188 * Returns true if both BreakIterators are of the same class, have the same 189 * rules, and iterate over the same text. 190 * @stable ICU 2.0 191 */ 192 @Override equals(Object that)193 public boolean equals(Object that) { 194 if (that == null) { 195 return false; 196 } 197 if (this == that) { 198 return true; 199 } 200 try { 201 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 202 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 203 return false; 204 } 205 if (fRData != null && other.fRData != null && 206 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 207 return false; 208 } 209 if (fText == null && other.fText == null) { 210 return true; 211 } 212 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 213 return false; 214 } 215 return fPosition == other.fPosition; 216 } 217 catch(ClassCastException e) { 218 return false; 219 } 220 } 221 222 /** 223 * Returns the description (rules) used to create this iterator. 224 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 225 * @stable ICU 2.0 226 */ 227 @Override toString()228 public String toString() { 229 String retStr = ""; 230 if (fRData != null) { 231 retStr = fRData.fRuleSource; 232 } 233 return retStr; 234 } 235 236 /** 237 * Compute a hashcode for this BreakIterator 238 * @return A hash code 239 * @stable ICU 2.0 240 */ 241 @Override hashCode()242 public int hashCode() 243 { 244 return fRData.fRuleSource.hashCode(); 245 } 246 247 248 private static final int START_STATE = 1; // The state number of the starting state 249 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 250 251 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 252 // of user text. A variable with this enum type keeps track of where we 253 // are. The state machine only fetches user text input while in RUN mode. 254 private static final int RBBI_START = 0; 255 private static final int RBBI_RUN = 1; 256 private static final int RBBI_END = 2; 257 258 /** 259 * The character iterator through which this BreakIterator accesses the text. 260 */ 261 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 262 263 /** 264 * The rule data for this BreakIterator instance. 265 * Not intended for public use. Declared public for testing purposes only. 266 * @internal 267 * @deprecated This API is ICU internal only. 268 */ 269 @Deprecated 270 public RBBIDataWrapper fRData; 271 272 /** 273 * The iteration state - current position, rule status for the current position, 274 * and whether the iterator ran off the end, yielding UBRK_DONE. 275 * Current position is pinned to be 0 < position <= text.length. 276 * Current position is always set to a boundary. 277 * 278 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 279 * Never has the value UBRK_DONE (-1). 280 */ 281 private int fPosition; 282 283 /** 284 * Index of the Rule {tag} values for the most recent match. 285 */ 286 private int fRuleStatusIndex; 287 288 /** 289 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 290 */ 291 private boolean fDone; 292 293 /** 294 * Array of look-ahead tentative results. 295 */ 296 private int[] fLookAheadMatches; 297 298 /** 299 * Cache of previously determined boundary positions. 300 */ 301 private BreakCache fBreakCache = new BreakCache(); 302 303 /** 304 * Flag used to indicate if phrase breaking is required. 305 */ 306 private boolean fPhraseBreaking = false; 307 308 309 /** 310 * Counter for the number of characters encountered with the "dictionary" 311 * flag set. Normal RBBI iterators don't use it, although the code 312 * for updating it is live. Dictionary Based break iterators (a subclass 313 * of us) access this field directly. 314 * @internal 315 */ 316 private int fDictionaryCharCount; 317 318 private DictionaryCache fDictionaryCache = new DictionaryCache(); 319 320 /** 321 * ICU debug argument name for RBBI 322 */ 323 private static final String RBBI_DEBUG_ARG = "rbbi"; 324 325 /** 326 * Debugging flag. Trace operation of state machine when true. 327 */ 328 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 329 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 330 331 /** 332 * The "default" break engine - just skips over ranges of dictionary words, 333 * producing no breaks. Should only be used if characters need to be handled 334 * by a dictionary but we have no dictionary implementation for them. 335 * 336 * Only one instance; shared by all break iterators. 337 */ 338 private static final UnhandledBreakEngine gUnhandledBreakEngine; 339 340 /** 341 * List of all known break engines, common for all break iterators. 342 * Lazily updated as break engines are needed, because instantiation of 343 * break engines is expensive. 344 * 345 * Because gAllBreakEngines can be referenced concurrently from different 346 * BreakIterator instances, all access is synchronized. 347 */ 348 private static final List<LanguageBreakEngine> gAllBreakEngines; 349 350 static { 351 gUnhandledBreakEngine = new UnhandledBreakEngine(); 352 gAllBreakEngines = new ArrayList<>(); 353 gAllBreakEngines.add(gUnhandledBreakEngine); 354 } 355 356 /** 357 * List of all known break engines. Similar to gAllBreakEngines, but local to a 358 * break iterator, allowing it to be used without synchronization. 359 */ 360 private List<LanguageBreakEngine> fBreakEngines; 361 362 /** 363 * Dump the contents of the state table and character classes for this break iterator. 364 * For debugging only. 365 * @internal 366 * @deprecated This API is ICU internal only. 367 */ 368 @Deprecated dump(java.io.PrintStream out)369 public void dump(java.io.PrintStream out) { 370 if (out == null) { 371 out = System.out; 372 } 373 this.fRData.dump(out); 374 } 375 376 /** 377 * Compile a set of source break rules into the binary state tables used 378 * by the break iterator engine. Creating a break iterator from precompiled 379 * rules is much faster than creating one from source rules. 380 * 381 * Binary break rules are not guaranteed to be compatible between different 382 * versions of ICU. 383 * 384 * 385 * @param rules The source form of the break rules 386 * @param ruleBinary An output stream to receive the compiled rules. 387 * @throws IOException If there is an error writing the output. 388 * @see #getInstanceFromCompiledRules(InputStream) 389 * @stable ICU 4.8 390 */ compileRules(String rules, OutputStream ruleBinary)391 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 392 RBBIRuleBuilder.compileRules(rules, ruleBinary); 393 } 394 395 //======================================================================= 396 // BreakIterator overrides 397 //======================================================================= 398 399 /** 400 * Sets the current iteration position to the beginning of the text. 401 * (i.e., the CharacterIterator's starting offset). 402 * @return The offset of the beginning of the text. 403 * @stable ICU 2.0 404 */ 405 @Override first()406 public int first() { 407 if (fText == null) { 408 return BreakIterator.DONE; 409 } 410 fText.first(); 411 int start = fText.getIndex(); 412 if (!fBreakCache.seek(start)) { 413 fBreakCache.populateNear(start); 414 } 415 fBreakCache.current(); 416 assert(fPosition == start); 417 return fPosition; 418 } 419 420 /** 421 * Sets the current iteration position to the end of the text. 422 * (i.e., the CharacterIterator's ending offset). 423 * @return The text's past-the-end offset. 424 * @stable ICU 2.0 425 */ 426 @Override last()427 public int last() { 428 if (fText == null) { 429 return BreakIterator.DONE; 430 } 431 int endPos = fText.getEndIndex(); 432 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 433 assert(endShouldBeBoundary); 434 if (fPosition != endPos) { 435 assert(fPosition == endPos); 436 } 437 return endPos; 438 } 439 440 /** 441 * Advances the iterator either forward or backward the specified number of steps. 442 * Negative values move backward, and positive values move forward. This is 443 * equivalent to repeatedly calling next() or previous(). 444 * @param n The number of steps to move. The sign indicates the direction 445 * (negative is backwards, and positive is forwards). 446 * @return The character offset of the boundary position n boundaries away from 447 * the current one. 448 * @stable ICU 2.0 449 */ 450 @Override next(int n)451 public int next(int n) { 452 int result = 0; 453 if (n > 0) { 454 for (; n > 0 && result != DONE; --n) { 455 result = next(); 456 } 457 } else if (n < 0) { 458 for (; n < 0 && result != DONE; ++n) { 459 result = previous(); 460 } 461 } else { 462 result = current(); 463 } 464 return result; 465 } 466 467 /** 468 * Advances the iterator to the next boundary position. 469 * @return The position of the first boundary after this one. 470 * @stable ICU 2.0 471 */ 472 @Override next()473 public int next() { 474 fBreakCache.next(); 475 return fDone ? DONE : fPosition; 476 } 477 478 /** 479 * Moves the iterator backwards, to the boundary preceding the current one. 480 * @return The position of the boundary position immediately preceding the starting position. 481 * @stable ICU 2.0 482 */ 483 @Override previous()484 public int previous() { 485 fBreakCache.previous(); 486 return fDone ? DONE : fPosition; 487 } 488 489 /** 490 * Sets the iterator to refer to the first boundary position following 491 * the specified position. 492 * @param startPos The position from which to begin searching for a break position. 493 * @return The position of the first break after the current position. 494 * @stable ICU 2.0 495 */ 496 @Override following(int startPos)497 public int following(int startPos) { 498 // if the supplied position is before the beginning, return the 499 // text's starting offset 500 if (startPos < fText.getBeginIndex()) { 501 return first(); 502 } 503 504 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 505 // Or it may be beyond the end of the text. 506 startPos = CISetIndex32(fText, startPos); 507 fBreakCache.following(startPos); 508 return fDone ? DONE : fPosition; 509 } 510 511 512 /** 513 * Sets the iterator to refer to the last boundary position before the 514 * specified position. 515 * @param offset The position to begin searching for a break from. 516 * @return The position of the last boundary before the starting position. 517 * @stable ICU 2.0 518 */ 519 @Override preceding(int offset)520 public int preceding(int offset) { 521 if (fText == null || offset > fText.getEndIndex()) { 522 return last(); 523 } else if (offset < fText.getBeginIndex()) { 524 return first(); 525 } 526 527 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 528 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 529 int adjustedOffset = offset; 530 fBreakCache.preceding(adjustedOffset); 531 return fDone ? DONE : fPosition; 532 533 } 534 535 536 /** 537 * Throw IllegalArgumentException unless begin <= offset < end. 538 * @stable ICU 2.0 539 */ checkOffset(int offset, CharacterIterator text)540 protected static final void checkOffset(int offset, CharacterIterator text) { 541 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 542 throw new IllegalArgumentException("offset out of bounds"); 543 } 544 } 545 546 547 /** 548 * Returns true if the specified position is a boundary position. As a side 549 * effect, leaves the iterator pointing to the first boundary position at 550 * or after "offset". 551 * @param offset the offset to check. 552 * @return True if "offset" is a boundary position. 553 * @stable ICU 2.0 554 */ 555 @Override isBoundary(int offset)556 public boolean isBoundary(int offset) { 557 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 558 // to not be boundaries, and to not be errors. 559 checkOffset(offset, fText); 560 561 // Adjust offset to be on a code point boundary and not beyond the end of the text. 562 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 563 // But we still need the side effect of leaving iteration at the following boundary. 564 int adjustedOffset = CISetIndex32(fText, offset); 565 566 boolean result = false; 567 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 568 result = (fBreakCache.current() == offset); 569 } 570 571 if (!result) { 572 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 573 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 574 next(); 575 } 576 return result; 577 578 } 579 580 /** 581 * Returns the current iteration position. Note that DONE is never 582 * returned from this function; if iteration has run to the end of a 583 * string, current() will return the length of the string while 584 * next() will return BreakIterator.DONE). 585 * @return The current iteration position. 586 * @stable ICU 2.0 587 */ 588 @Override current()589 public int current() { 590 return (fText != null) ? fPosition : BreakIterator.DONE; 591 } 592 593 594 /** 595 * Return the status tag from the break rule that determined the boundary at 596 * the current iteration position. The values appear in the rule source 597 * within brackets, {123}, for example. For rules that do not specify a 598 * status, a default value of 0 is returned. If more than one rule applies, 599 * the numerically largest of the possible status values is returned. 600 * <p> 601 * Of the standard types of ICU break iterators, only the word and line break 602 * iterator provides status values. The values are defined in 603 * class RuleBasedBreakIterator, and allow distinguishing between words 604 * that contain alphabetic letters, "words" that appear to be numbers, 605 * punctuation and spaces, words containing ideographic characters, and 606 * more. Call <code>getRuleStatus</code> after obtaining a boundary 607 * position from <code>next()</code>, <code>previous()</code>, or 608 * any other break iterator functions that returns a boundary position. 609 * <p> 610 * Note that <code>getRuleStatus()</code> returns the value corresponding to 611 * <code>current()</code> index even after <code>next()</code> has returned DONE. 612 * <p> 613 614 * @return the status from the break rule that determined the boundary 615 * at the current iteration position. 616 * 617 * @stable ICU 60 618 */ 619 620 @Override getRuleStatus()621 public int getRuleStatus() { 622 // Status records have this form: 623 // Count N <-- fLastRuleStatusIndex points here. 624 // Status val 0 625 // Status val 1 626 // ... 627 // Status val N-1 <-- the value we need to return 628 // The status values are sorted in ascending order. 629 // This function returns the last (largest) of the array of status values. 630 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 631 int tagVal = fRData.fStatusTable[idx]; 632 return tagVal; 633 } 634 635 /** 636 * Get the status (tag) values from the break rule(s) that determined the boundary 637 * at the current iteration position. The values appear in the rule source 638 * within brackets, {123}, for example. The default status value for rules 639 * that do not explicitly provide one is zero. 640 * <p> 641 * The status values used by the standard ICU break rules are defined 642 * as public constants in class RuleBasedBreakIterator. 643 * <p> 644 * If the size of the output array is insufficient to hold the data, 645 * the output will be truncated to the available length. No exception 646 * will be thrown. 647 * 648 * @param fillInArray an array to be filled in with the status values. 649 * @return The number of rule status values from the rules that determined 650 * the boundary at the current iteration position. 651 * In the event that the array is too small, the return value 652 * is the total number of status values that were available, 653 * not the reduced number that were actually returned. 654 * @stable ICU 60 655 */ 656 @Override getRuleStatusVec(int[] fillInArray)657 public int getRuleStatusVec(int[] fillInArray) { 658 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 659 if (fillInArray != null) { 660 int numToCopy = Math.min(numStatusVals, fillInArray.length); 661 for (int i=0; i<numToCopy; i++) { 662 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 663 } 664 } 665 return numStatusVals; 666 } 667 668 /** 669 * Returns a CharacterIterator over the text being analyzed. 670 * <p> 671 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 672 * must not be modified in any way while the BreakIterator is still in use. 673 * Doing so will lead to undefined behavior of the BreakIterator. 674 * Clone the returned CharacterIterator first and work with that. 675 * <p> 676 * The returned CharacterIterator is a reference 677 * to the <b>actual iterator being used</b> by the BreakIterator. 678 * No guarantees are made about the current position 679 * of this iterator when it is returned; it may differ from the 680 * BreakIterators current position. If you need to move that 681 * position to examine the text, clone this function's return value first. 682 * @return An iterator over the text being analyzed. 683 * @stable ICU 2.0 684 */ 685 @Override getText()686 public CharacterIterator getText() { 687 return fText; 688 } 689 690 /** 691 * Set the iterator to analyze a new piece of text. This function resets 692 * the current iteration position to the beginning of the text. 693 * (The old iterator is dropped.) 694 * <p> 695 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 696 * directly by the BreakIterator, and must not be altered in any 697 * way by code outside of the BreakIterator. 698 * Doing so will lead to undefined behavior of the BreakIterator. 699 * 700 * @param newText An iterator over the text to analyze. 701 * @stable ICU 2.0 702 */ 703 @Override setText(CharacterIterator newText)704 public void setText(CharacterIterator newText) { 705 if (newText != null) { 706 fBreakCache.reset(newText.getBeginIndex(), 0); 707 } else { 708 fBreakCache.reset(); 709 } 710 fDictionaryCache.reset(); 711 fText = newText; 712 this.first(); 713 } 714 715 /** 716 * Control debug, trace and dump options. 717 * @internal 718 * @deprecated This API is ICU internal only. 719 */ 720 @Deprecated 721 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 722 ICUDebug.value(RBBI_DEBUG_ARG) : null; 723 724 getLanguageBreakEngine(int c)725 private LanguageBreakEngine getLanguageBreakEngine(int c) { 726 727 // We have a dictionary character. 728 // Does an already instantiated break engine handle it? 729 for (LanguageBreakEngine candidate : fBreakEngines) { 730 if (candidate.handles(c)) { 731 return candidate; 732 } 733 } 734 735 synchronized (gAllBreakEngines) { 736 // This break iterator's list of break engines didn't handle the character. 737 // Check the global list, another break iterator may have instantiated the 738 // desired engine. 739 for (LanguageBreakEngine candidate : gAllBreakEngines) { 740 if (candidate.handles(c)) { 741 fBreakEngines.add(candidate); 742 return candidate; 743 } 744 } 745 746 // The global list doesn't have an existing engine, build one. 747 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 748 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 749 // Katakana, Hiragana and Han are handled by the same dictionary engine. 750 // Fold them together for mapping from script -> engine. 751 script = UScript.HAN; 752 } 753 754 LanguageBreakEngine eng; 755 try { 756 switch (script) { 757 case UScript.THAI: 758 try { 759 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 760 } catch (MissingResourceException e) { 761 eng = new ThaiBreakEngine(); 762 } 763 break; 764 case UScript.LAO: 765 eng = new LaoBreakEngine(); 766 break; 767 case UScript.MYANMAR: 768 try { 769 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 770 } catch (MissingResourceException e) { 771 eng = new BurmeseBreakEngine(); 772 } 773 break; 774 case UScript.KHMER: 775 eng = new KhmerBreakEngine(); 776 break; 777 case UScript.HAN: 778 eng = new CjkBreakEngine(false); 779 break; 780 case UScript.HANGUL: 781 eng = new CjkBreakEngine(true); 782 break; 783 default: 784 gUnhandledBreakEngine.handleChar(c); 785 eng = gUnhandledBreakEngine; 786 break; 787 } 788 } catch (IOException e) { 789 eng = null; 790 } 791 792 if (eng != null && eng != gUnhandledBreakEngine) { 793 gAllBreakEngines.add(eng); 794 fBreakEngines.add(eng); 795 } 796 return eng; 797 } // end synchronized(gAllBreakEngines) 798 } 799 800 /** 801 * The State Machine Engine for moving forward is here. 802 * This function is the heart of the RBBI run time engine. 803 * 804 * Input 805 * fPosition, the position in the text to begin from. 806 * Output 807 * fPosition: the boundary following the starting position. 808 * fDictionaryCharCount the number of dictionary characters encountered. 809 * If > 0, the segment will be further subdivided 810 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 811 * 812 * @return the new iterator position 813 * 814 * A note on supplementary characters and the position of underlying 815 * Java CharacterIterator: Normally, a character iterator is positioned at 816 * the char most recently returned by next(). Within this function, when 817 * a supplementary char is being processed, the char iterator is left 818 * sitting on the trail surrogate, in the middle of the code point. 819 * This is different from everywhere else, where an iterator always 820 * points at the lead surrogate of a supplementary. 821 */ handleNext()822 private int handleNext() { 823 if (TRACE) { 824 System.out.println("Handle Next pos char state category"); 825 } 826 827 // handleNext always sets the break tag value. 828 // Set the default for it. 829 fRuleStatusIndex = 0; 830 fDictionaryCharCount = 0; 831 832 // caches for quicker access 833 CharacterIterator text = fText; 834 CodePointTrie trie = fRData.fTrie; 835 836 char[] stateTable = fRData.fFTable.fTable; 837 int initialPosition = fPosition; 838 text.setIndex(initialPosition); 839 int result = initialPosition; 840 841 // Set up the starting char 842 int c = text.current(); 843 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 844 c = nextTrail32(text, c); 845 if (c == DONE32) { 846 fDone = true; 847 return BreakIterator.DONE; 848 } 849 } 850 851 // Set the initial state for the state machine 852 int state = START_STATE; 853 int row = fRData.getRowIndex(state); 854 short category = 3; 855 int flagsState = fRData.fFTable.fFlags; 856 int dictStart = fRData.fFTable.fDictCategoriesStart; 857 int mode = RBBI_RUN; 858 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 859 category = 2; 860 mode = RBBI_START; 861 if (TRACE) { 862 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 863 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 864 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 865 } 866 } 867 868 // loop until we reach the end of the text or transition to state 0 869 while (state != STOP_STATE) { 870 if (c == DONE32) { 871 // Reached end of input string. 872 if (mode == RBBI_END) { 873 // We have already run the loop one last time with the 874 // character set to the pseudo {eof} value. Now it is time 875 // to unconditionally bail out. 876 break; 877 } 878 // Run the loop one last time with the fake end-of-input character category 879 mode = RBBI_END; 880 category = 1; 881 } 882 else if (mode == RBBI_RUN) { 883 // Get the char category. An incoming category of 1 or 2 mens that 884 // we are preset for doing the beginning or end of input, and 885 // that we shouldn't get a category from an actual text input character. 886 // 887 888 // look up the current character's character category, which tells us 889 // which column in the state table to look at. 890 // 891 category = (short) trie.get(c); 892 893 // Check for categories that require word dictionary handling. 894 if (category >= dictStart) { 895 fDictionaryCharCount++; 896 } 897 898 if (TRACE) { 899 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 900 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 901 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 902 } 903 904 // Advance to the next character. 905 // If this is a beginning-of-input loop iteration, don't advance. 906 // The next iteration will be processing the first real input character. 907 c = text.next(); 908 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 909 c = nextTrail32(text, c); 910 } 911 } 912 else { 913 mode = RBBI_RUN; 914 } 915 916 // look up a state transition in the state table 917 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 918 row = fRData.getRowIndex(state); 919 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 920 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 921 // Match found, common case 922 result = text.getIndex(); 923 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 924 // The iterator has been left in the middle of a surrogate pair. 925 // We want the start of it. 926 result--; 927 } 928 929 // Remember the break status (tag) values. 930 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 931 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 932 // Lookahead match is completed 933 int lookaheadResult = fLookAheadMatches[accepting]; 934 if (lookaheadResult >= 0) { 935 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 936 fPosition = lookaheadResult; 937 return lookaheadResult; 938 } 939 } 940 941 942 // If we are at the position of the '/' in a look-ahead (hard break) rule; 943 // record the current position, to be returned later, if the full rule matches. 944 // TODO: Move this check before the previous check of fAccepting. 945 // This would enable hard-break rules with no following context. 946 // But there are line break test failures when trying this. Investigate. 947 // Issue ICU-20837 948 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 949 if (rule != 0) { 950 int pos = text.getIndex(); 951 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 952 // The iterator has been left in the middle of a surrogate pair. 953 // We want the beginning of it. 954 pos--; 955 } 956 fLookAheadMatches[rule] = pos; 957 } 958 959 960 } // End of state machine main loop 961 962 // The state machine is done. Check whether it found a match... 963 964 // If the iterator failed to advance in the match engine force it ahead by one. 965 // This indicates a defect in the break rules, which should always match 966 // at least one character. 967 968 if (result == initialPosition) { 969 if (TRACE) { 970 System.out.println("Iterator did not move. Advancing by 1."); 971 } 972 text.setIndex(initialPosition); 973 next32(text); 974 result = text.getIndex(); 975 fRuleStatusIndex = 0; 976 } 977 978 // Leave the iterator at our result position. 979 // (we may have advanced beyond the last accepting position chasing after 980 // longer matches that never completed.) 981 fPosition = result; 982 983 if (TRACE) { 984 System.out.println("result = " + result); 985 } 986 return result; 987 } 988 989 /** 990 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 991 * This locates a "Safe Position" from which the forward break rules 992 * will operate correctly. A Safe Position is not necessarily a boundary itself. 993 * 994 * The logic of this function is very similar to handleNext(), above, but simpler 995 * because the safe table does not require as many options. 996 * 997 * @param fromPosition the position in the input text to begin the iteration. 998 * @internal 999 */ handleSafePrevious(int fromPosition)1000 private int handleSafePrevious(int fromPosition) { 1001 char state; 1002 short category = 0; 1003 int result = 0; 1004 1005 // caches for quicker access 1006 CharacterIterator text = fText; 1007 CodePointTrie trie = fRData.fTrie; 1008 char[] stateTable = fRData.fRTable.fTable; 1009 1010 CISetIndex32(text, fromPosition); 1011 if (TRACE) { 1012 System.out.print("Handle Previous pos char state category"); 1013 } 1014 1015 // if we're already at the start of the text, return DONE. 1016 if (text.getIndex() == text.getBeginIndex()) { 1017 return BreakIterator.DONE; 1018 } 1019 1020 // Set the initial state for the state machine 1021 int c = CharacterIteration.previous32(text); 1022 state = START_STATE; 1023 int row = fRData.getRowIndex(state); 1024 1025 // loop until we reach the start of the text or transition to state 0 1026 // 1027 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 1028 1029 // look up the current character's character category, which tells us 1030 // which column in the state table to look at. 1031 // 1032 // And off the dictionary flag bit. For reverse iteration it is not used. 1033 category = (short) trie.get(c); 1034 if (TRACE) { 1035 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1036 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1037 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1038 } 1039 1040 // State Transition - move machine to its next state 1041 // 1042 assert(category < fRData.fHeader.fCatCount); 1043 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1044 row = fRData.getRowIndex(state); 1045 1046 if (state == STOP_STATE) { 1047 // This is the normal exit from the lookup state machine. 1048 // Transition to state zero means we have found a safe point. 1049 break; 1050 } 1051 } 1052 1053 // The state machine is done. 1054 result = text.getIndex(); 1055 if (TRACE) { 1056 System.out.println("result = " + result); 1057 } 1058 return result; 1059 } 1060 1061 /** 1062 * Set the index of a CharacterIterator. 1063 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1064 * If the index points to a trail surrogate of a supplementary character, adjust it 1065 * to the start (lead surrogate) index. 1066 * 1067 * @param ci A CharacterIterator to set 1068 * @param index the index to set 1069 * @return the resulting index, possibly pinned or adjusted. 1070 */ 1071 private static int CISetIndex32(CharacterIterator ci, int index) { 1072 if (index <= ci.getBeginIndex()) { 1073 ci.first(); 1074 } else if (index >= ci.getEndIndex()) { 1075 ci.setIndex(ci.getEndIndex()); 1076 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1077 if (!Character.isHighSurrogate(ci.previous())) { 1078 ci.next(); 1079 } 1080 } 1081 return ci.getIndex(); 1082 } 1083 1084 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1085 * Dictionary boundaries are moved first to this cache, then from here 1086 * to the main BreakCache, where they may inter-leave with non-dictionary 1087 * boundaries. The public BreakIterator API always fetches directly 1088 * from the main BreakCache, not from here. 1089 * 1090 * In common situations, the number of boundaries in a single dictionary run 1091 * should be quite small, it will be terminated by punctuation, spaces, 1092 * or any other non-dictionary characters. The main BreakCache may end 1093 * up with boundaries from multiple dictionary based runs. 1094 * 1095 * The boundaries are stored in a simple ArrayList (vector), with the 1096 * assumption that they will be accessed sequentially. 1097 */ 1098 class DictionaryCache { 1099 1100 void reset() { 1101 fPositionInCache = -1; 1102 fStart = 0; 1103 fLimit = 0; 1104 fFirstRuleStatusIndex = 0; 1105 fOtherRuleStatusIndex = 0; 1106 fBreaks.removeAllElements(); 1107 }; 1108 1109 boolean following(int fromPos) { 1110 if (fromPos >= fLimit || fromPos < fStart) { 1111 fPositionInCache = -1; 1112 return false; 1113 } 1114 1115 // Sequential iteration, move from previous boundary to the following 1116 1117 int r = 0; 1118 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1119 ++fPositionInCache; 1120 if (fPositionInCache >= fBreaks.size()) { 1121 fPositionInCache = -1; 1122 return false; 1123 } 1124 r = fBreaks.elementAt(fPositionInCache); 1125 assert(r > fromPos); 1126 fBoundary = r; 1127 fStatusIndex = fOtherRuleStatusIndex; 1128 return true; 1129 } 1130 1131 // Random indexing. Linear search for the boundary following the given position. 1132 1133 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1134 r= fBreaks.elementAt(fPositionInCache); 1135 if (r > fromPos) { 1136 fBoundary = r; 1137 fStatusIndex = fOtherRuleStatusIndex; 1138 return true; 1139 } 1140 } 1141 1142 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1143 assert(false); 1144 fPositionInCache = -1; 1145 return false; 1146 }; 1147 preceding(int fromPos)1148 boolean preceding(int fromPos) { 1149 if (fromPos <= fStart || fromPos > fLimit) { 1150 fPositionInCache = -1; 1151 return false; 1152 } 1153 1154 if (fromPos == fLimit) { 1155 fPositionInCache = fBreaks.size() - 1; 1156 if (fPositionInCache >= 0) { 1157 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1158 } 1159 } 1160 1161 int r; 1162 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1163 --fPositionInCache; 1164 r = fBreaks.elementAt(fPositionInCache); 1165 assert(r < fromPos); 1166 fBoundary = r; 1167 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1168 return true; 1169 } 1170 1171 if (fPositionInCache == 0) { 1172 fPositionInCache = -1; 1173 return false; 1174 } 1175 1176 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1177 r = fBreaks.elementAt(fPositionInCache); 1178 if (r < fromPos) { 1179 fBoundary = r; 1180 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1181 return true; 1182 } 1183 } 1184 assert(false); 1185 fPositionInCache = -1; 1186 return false; 1187 }; 1188 1189 /** 1190 * Populate the cache with the dictionary based boundaries within a region of text. 1191 * @param startPos The start position of a range of text 1192 * @param endPos The end position of a range of text 1193 * @param firstRuleStatus The rule status index that applies to the break at startPos 1194 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1195 * @internal 1196 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1197 void populateDictionary(int startPos, int endPos, 1198 int firstRuleStatus, int otherRuleStatus) { 1199 if ((endPos - startPos) <= 1) { 1200 return; 1201 } 1202 1203 reset(); 1204 fFirstRuleStatusIndex = firstRuleStatus; 1205 fOtherRuleStatusIndex = otherRuleStatus; 1206 1207 int rangeStart = startPos; 1208 int rangeEnd = endPos; 1209 1210 int category; 1211 int current; 1212 int foundBreakCount = 0; 1213 1214 // Loop through the text, looking for ranges of dictionary characters. 1215 // For each span, find the appropriate break engine, and ask it to find 1216 // any breaks within the span. 1217 1218 fText.setIndex(rangeStart); 1219 int c = CharacterIteration.current32(fText); 1220 category = (short)fRData.fTrie.get(c); 1221 int dictStart = fRData.fFTable.fDictCategoriesStart; 1222 1223 while(true) { 1224 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1225 c = CharacterIteration.next32(fText); // pre-increment 1226 category = (short)fRData.fTrie.get(c); 1227 } 1228 if (current >= rangeEnd) { 1229 break; 1230 } 1231 1232 // We now have a dictionary character. Get the appropriate language object 1233 // to deal with it. 1234 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1235 1236 // Ask the language object if there are any breaks. It will add them to the cache and 1237 // leave the text pointer on the other side of its range, ready to search for the next one. 1238 if (lbe != null) { 1239 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); 1240 } 1241 1242 // Reload the loop variables for the next go-round 1243 c = CharacterIteration.current32(fText); 1244 category = (short)fRData.fTrie.get(c); 1245 } 1246 1247 // If we found breaks, ensure that the first and last entries are 1248 // the original starting and ending position. And initialize the 1249 // cache iteration position to the first entry. 1250 1251 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1252 if (foundBreakCount > 0) { 1253 assert(foundBreakCount == fBreaks.size()); 1254 if (startPos < fBreaks.elementAt(0)) { 1255 // The dictionary did not place a boundary at the start of the segment of text. 1256 // Add one now. This should not commonly happen, but it would be easy for interactions 1257 // of the rules for dictionary segments and the break engine implementations to 1258 // inadvertently cause it. Cover it here, just in case. 1259 fBreaks.offer(startPos); 1260 } 1261 if (endPos > fBreaks.peek()) { 1262 fBreaks.push(endPos); 1263 } 1264 fPositionInCache = 0; 1265 // Note: Dictionary matching may extend beyond the original limit. 1266 fStart = fBreaks.elementAt(0); 1267 fLimit = fBreaks.peek(); 1268 } else { 1269 // there were no language-based breaks, even though the segment contained 1270 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1271 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1272 } 1273 1274 }; 1275 1276 DictionaryCache()1277 DictionaryCache() { 1278 fPositionInCache = -1; 1279 fBreaks = new DictionaryBreakEngine.DequeI(); 1280 } 1281 1282 /** 1283 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1284 * 1285 * @param src the source object to be copied. 1286 */ DictionaryCache(DictionaryCache src)1287 DictionaryCache(DictionaryCache src) { 1288 try { 1289 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1290 } 1291 catch (CloneNotSupportedException e) { 1292 throw new RuntimeException(e); 1293 } 1294 fPositionInCache = src.fPositionInCache; 1295 fStart = src.fStart; 1296 fLimit = src.fLimit; 1297 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1298 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1299 fBoundary = src.fBoundary; 1300 fStatusIndex = src.fStatusIndex; 1301 } 1302 1303 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1304 DictionaryBreakEngine.DequeI fBreaks; 1305 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1306 // // or preceding(). Optimizes sequential access. 1307 int fStart; // Text position of first boundary in cache. 1308 int fLimit; // Last boundary in cache. Which is the limit of the 1309 // // text segment being handled by the dictionary. 1310 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1311 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1312 int fBoundary; // Current boundary. Set by preceding(), following(). 1313 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1314 }; 1315 1316 1317 1318 1319 /* 1320 * class BreakCache 1321 * 1322 * Cache of break boundary positions and rule status values. 1323 * Break iterator API functions, next(), previous(), etc., will use cached results 1324 * when possible, and otherwise cache new results as they are obtained. 1325 * 1326 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1327 * 1328 * The cache is implemented as a single circular buffer. 1329 */ 1330 1331 /* 1332 * size of the circular cache buffer. 1333 */ 1334 1335 class BreakCache { 1336 BreakCache()1337 BreakCache() { 1338 reset(); 1339 }; 1340 reset(int pos, int ruleStatus)1341 void reset(int pos, int ruleStatus) { 1342 fStartBufIdx = 0; 1343 fEndBufIdx = 0; 1344 fTextIdx = pos; 1345 fBufIdx = 0; 1346 fBoundaries[0] = pos; 1347 fStatuses[0] = (short)ruleStatus; 1348 } 1349 reset()1350 void reset() {reset(0, 0); }; 1351 next()1352 void next() { 1353 if (fBufIdx == fEndBufIdx) { 1354 fDone = !populateFollowing(); 1355 fPosition = fTextIdx; 1356 fRuleStatusIndex = fStatuses[fBufIdx]; 1357 } else { 1358 fBufIdx = modChunkSize(fBufIdx + 1); 1359 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1360 fRuleStatusIndex = fStatuses[fBufIdx]; 1361 } 1362 }; 1363 previous()1364 void previous() { 1365 int initialBufIdx = fBufIdx; 1366 if (fBufIdx == fStartBufIdx) { 1367 // At start of cache. Prepend to it. 1368 populatePreceding(); 1369 } else { 1370 // Cache already holds the next boundary 1371 fBufIdx = modChunkSize(fBufIdx - 1); 1372 fTextIdx = fBoundaries[fBufIdx]; 1373 } 1374 fDone = (fBufIdx == initialBufIdx); 1375 fPosition = fTextIdx; 1376 fRuleStatusIndex = fStatuses[fBufIdx]; 1377 return; 1378 }; 1379 1380 // Move the iteration state to the position following the startPosition. 1381 // Input position must be pinned to the input length. following(int startPos)1382 void following(int startPos) { 1383 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1384 // startPos is in the cache. Do a next() from that position. 1385 // TODO: an awkward set of interactions with bi->fDone 1386 // seek() does not clear it; it can't because of interactions with populateNear(). 1387 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1388 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1389 fDone = false; 1390 next(); 1391 } 1392 1393 }; 1394 preceding(int startPos)1395 void preceding(int startPos) { 1396 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1397 if (startPos == fTextIdx) { 1398 previous(); 1399 } else { 1400 // seek() leaves the BreakCache positioned at the preceding boundary 1401 // if the requested position is between two boundaries. 1402 // current() pushes the BreakCache position out to the BreakIterator itself. 1403 assert(startPos > fTextIdx); 1404 current(); 1405 } 1406 } 1407 return; 1408 }; 1409 1410 /** 1411 * Update the state of the public BreakIterator (fBI) to reflect the 1412 * current state of the break iterator cache (this). 1413 */ current()1414 int current() { 1415 fPosition = fTextIdx; 1416 fRuleStatusIndex = fStatuses[fBufIdx]; 1417 fDone = false; 1418 return fTextIdx; 1419 }; 1420 1421 /** 1422 * Add boundaries to the cache near the specified position. 1423 * The given position need not be a boundary itself. 1424 * The input position must be within the range of the text, and 1425 * on a code point boundary. 1426 * If the requested position is a break boundary, leave the iteration 1427 * position on it. 1428 * If the requested position is not a boundary, leave the iteration 1429 * position on the preceding boundary and include both the the 1430 * preceding and following boundaries in the cache. 1431 * Additional boundaries, either preceding or following, may be added 1432 * to the cache as a side effect. 1433 * 1434 * Return false if the operation failed. 1435 */ populateNear(int position)1436 boolean populateNear(int position) { 1437 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1438 1439 // Add boundaries to the cache near the specified position. 1440 // The given position need not be a boundary itself. 1441 // The input position must be within the range of the text, and 1442 // on a code point boundary. 1443 // If the requested position is a break boundary, leave the iteration 1444 // position on it. 1445 // If the requested position is not a boundary, leave the iteration 1446 // position on the preceding boundary and include both the 1447 // preceding and following boundaries in the cache. 1448 // Additional boundaries, either preceding or following, may be added 1449 // to the cache as a side effect. 1450 1451 // If the requested position is not near already cached positions, clear the existing cache, 1452 // find a near-by boundary and begin new cache contents there. 1453 1454 // Threshold for a text position to be considered near to existing cache contents. 1455 // TODO: See issue ICU-22024 "perf tuning of Cache needed." 1456 // This value is subject to change. See the ticket for more details. 1457 final int CACHE_NEAR = 15; 1458 1459 int startOfText = fText.getBeginIndex(); 1460 int aBoundary = -1; 1461 int ruleStatusIndex = 0; 1462 boolean retainCache = false; 1463 if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { 1464 // Requested position is near the existing cache. Retain it. 1465 retainCache = true; 1466 } else if (position <= startOfText + CACHE_NEAR) { 1467 // Requested position is near the start of the text. Fill cache from start, skipping 1468 // the need to find a safe point. 1469 retainCache = false; 1470 aBoundary = startOfText; 1471 } else { 1472 // Requested position is not near the existing cache. 1473 // Find a safe point to refill the cache from. 1474 int backupPos = handleSafePrevious(position); 1475 1476 if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { 1477 // The requested position is beyond the end of the existing cache, but the 1478 // reverse rules produced a position near or before the cached region. 1479 // Retain the existing cache, and fill from the end of it. 1480 retainCache = true; 1481 } else if (backupPos < startOfText + CACHE_NEAR) { 1482 // The safe reverse rules moved us to near the start of text. 1483 // Take that (index 0) as the backup boundary, avoiding the complication 1484 // (in the following block) of moving forward from the safe point to a known boundary. 1485 // 1486 // Retain the cache if it begins not too far from the requested position. 1487 aBoundary = startOfText; 1488 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); 1489 } else { 1490 // The safe reverse rules produced a position that is neither near the existing 1491 // cache, nor near the start of text. 1492 // Advance to the boundary following. 1493 // There is a complication: the safe reverse rules identify pairs of code points 1494 // that are safe. If advancing from the safe point moves forwards by less than 1495 // two code points, we need to advance one more time to ensure that the boundary 1496 // is good, including a correct rules status value. 1497 // 1498 retainCache = false; 1499 fPosition = backupPos; 1500 aBoundary = handleNext(); 1501 if (aBoundary == backupPos + 1 || 1502 (aBoundary == backupPos + 2 && 1503 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1504 Character.isLowSurrogate(fText.next()))) { 1505 // The initial handleNext() only advanced by a single code point. Go again. 1506 // Safe rules identify safe pairs. 1507 aBoundary = handleNext(); 1508 } 1509 if (aBoundary == BreakIterator.DONE) { 1510 aBoundary = fText.getEndIndex(); 1511 } 1512 ruleStatusIndex = fRuleStatusIndex; 1513 } 1514 } 1515 1516 if (!retainCache) { 1517 assert(aBoundary != -1); 1518 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1519 } 1520 1521 1522 // Fill in boundaries between existing cache content and the new requested position. 1523 1524 if (fBoundaries[fEndBufIdx] < position) { 1525 // The last position in the cache precedes the requested position. 1526 // Add following position(s) to the cache. 1527 while (fBoundaries[fEndBufIdx] < position) { 1528 if (!populateFollowing()) { 1529 assert false; 1530 return false; 1531 } 1532 } 1533 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1534 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1535 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1536 previous(); 1537 } 1538 return true; 1539 } 1540 1541 if (fBoundaries[fStartBufIdx] > position) { 1542 // The first position in the cache is beyond the requested position. 1543 // back up more until we get a boundary <= the requested position. 1544 while (fBoundaries[fStartBufIdx] > position) { 1545 populatePreceding(); 1546 } 1547 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1548 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1549 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1550 next(); 1551 } 1552 if (fTextIdx > position) { 1553 // If position is not itself a boundary, the next() loop above will overshoot. 1554 // Back up one, leaving cache position at the boundary preceding the requested position. 1555 previous(); 1556 } 1557 return true; 1558 } 1559 1560 assert fTextIdx == position; 1561 return true; 1562 1563 }; 1564 1565 /** 1566 * Add boundary(s) to the cache following the current last boundary. 1567 * Return false if at the end of the text, and no more boundaries can be added. 1568 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1569 */ populateFollowing()1570 boolean populateFollowing() { 1571 int fromPosition = fBoundaries[fEndBufIdx]; 1572 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1573 int pos = 0; 1574 int ruleStatusIdx = 0; 1575 1576 if (fDictionaryCache.following(fromPosition)) { 1577 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1578 return true; 1579 } 1580 1581 fPosition = fromPosition; 1582 pos = handleNext(); 1583 if (pos == BreakIterator.DONE) { 1584 return false; 1585 } 1586 1587 ruleStatusIdx = fRuleStatusIndex; 1588 if (fDictionaryCharCount > 0) { 1589 // The text segment obtained from the rules includes dictionary characters. 1590 // Subdivide it, with subdivided results going into the dictionary cache. 1591 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1592 if (fDictionaryCache.following(fromPosition)) { 1593 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1594 return true; 1595 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1596 // But be careful with interactions with populateNear(). 1597 } 1598 } 1599 1600 // Rule based segment did not include dictionary characters. 1601 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1602 // meaning that we didn't take the return, above. 1603 // Add its end point to the cache. 1604 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1605 1606 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1607 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1608 // 1609 for (int count=0; count<6; ++count) { 1610 pos = handleNext(); 1611 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1612 break; 1613 } 1614 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1615 } 1616 return true; 1617 }; 1618 1619 /** 1620 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1621 * Leave the iteration position on the first added boundary. 1622 * Return false if no boundaries could be added (if at the start of the text.) 1623 */ populatePreceding()1624 boolean populatePreceding() { 1625 int textBegin = fText.getBeginIndex(); 1626 int fromPosition = fBoundaries[fStartBufIdx]; 1627 if (fromPosition == textBegin) { 1628 return false; 1629 } 1630 1631 int position = textBegin; 1632 int positionStatusIdx = 0; 1633 1634 if (fDictionaryCache.preceding(fromPosition)) { 1635 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1636 return true; 1637 } 1638 1639 int backupPosition = fromPosition; 1640 1641 // Find a boundary somewhere preceding the first already-cached boundary 1642 do { 1643 backupPosition = backupPosition - 30; 1644 if (backupPosition <= textBegin) { 1645 backupPosition = textBegin; 1646 } else { 1647 backupPosition = handleSafePrevious(backupPosition); 1648 } 1649 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1650 position = textBegin; 1651 positionStatusIdx = 0; 1652 } else { 1653 // Advance to the boundary following the backup position. 1654 // There is a complication: the safe reverse rules identify pairs of code points 1655 // that are safe. If advancing from the safe point moves forwards by less than 1656 // two code points, we need to advance one more time to ensure that the boundary 1657 // is good, including a correct rules status value. 1658 // 1659 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1660 position = handleNext(); 1661 if (position == backupPosition + 1 || 1662 (position == backupPosition + 2 && 1663 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1664 Character.isLowSurrogate(fText.next()))) { 1665 // The initial handleNext() only advanced by a single code point. Go again. 1666 // Safe rules identify safe pairs. 1667 position = handleNext(); 1668 } 1669 positionStatusIdx = fRuleStatusIndex; 1670 } 1671 } while (position >= fromPosition); 1672 1673 // Find boundaries between the one we just located and the first already-cached boundary 1674 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1675 1676 fSideBuffer.removeAllElements(); 1677 fSideBuffer.push(position); 1678 fSideBuffer.push(positionStatusIdx); 1679 1680 do { 1681 int prevPosition = fPosition = position; 1682 int prevStatusIdx = positionStatusIdx; 1683 position = handleNext(); 1684 positionStatusIdx = fRuleStatusIndex; 1685 if (position == BreakIterator.DONE) { 1686 break; 1687 } 1688 1689 boolean segmentHandledByDictionary = false; 1690 if (fDictionaryCharCount != 0) { 1691 // Segment from the rules includes dictionary characters. 1692 // Subdivide it, with subdivided results going into the dictionary cache. 1693 int dictSegEndPosition = position; 1694 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1695 while (fDictionaryCache.following(prevPosition)) { 1696 position = fDictionaryCache.fBoundary; 1697 positionStatusIdx = fDictionaryCache.fStatusIndex; 1698 segmentHandledByDictionary = true; 1699 assert(position > prevPosition); 1700 if (position >= fromPosition) { 1701 break; 1702 } 1703 assert(position <= dictSegEndPosition); 1704 fSideBuffer.push(position); 1705 fSideBuffer.push(positionStatusIdx); 1706 prevPosition = position; 1707 } 1708 assert(position==dictSegEndPosition || position>=fromPosition); 1709 } 1710 1711 if (!segmentHandledByDictionary && position < fromPosition) { 1712 fSideBuffer.push(position); 1713 fSideBuffer.push(positionStatusIdx); 1714 } 1715 } while (position < fromPosition); 1716 1717 // Move boundaries from the side buffer to the main circular buffer. 1718 boolean success = false; 1719 if (!fSideBuffer.isEmpty()) { 1720 positionStatusIdx = fSideBuffer.pop(); 1721 position = fSideBuffer.pop(); 1722 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1723 success = true; 1724 } 1725 1726 while (!fSideBuffer.isEmpty()) { 1727 positionStatusIdx = fSideBuffer.pop(); 1728 position = fSideBuffer.pop(); 1729 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1730 // No space in circular buffer to hold a new preceding result while 1731 // also retaining the current cache (iteration) position. 1732 // Bailing out is safe; the cache will refill again if needed. 1733 break; 1734 } 1735 } 1736 return success; 1737 }; 1738 1739 1740 static final boolean RetainCachePosition = false; 1741 static final boolean UpdateCachePosition = true; 1742 1743 /** 1744 * Add the boundary following the current position. 1745 * The current position can be left as it was, or changed to the newly added boundary, 1746 * as specified by the update parameter. 1747 */ addFollowing(int position, int ruleStatusIdx, boolean update)1748 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1749 assert(position > fBoundaries[fEndBufIdx]); 1750 assert(ruleStatusIdx <= Short.MAX_VALUE); 1751 int nextIdx = modChunkSize(fEndBufIdx + 1); 1752 if (nextIdx == fStartBufIdx) { 1753 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1754 } 1755 fBoundaries[nextIdx] = position; 1756 fStatuses[nextIdx] = (short)ruleStatusIdx; 1757 fEndBufIdx = nextIdx; 1758 if (update == UpdateCachePosition) { 1759 // Set current position to the newly added boundary. 1760 fBufIdx = nextIdx; 1761 fTextIdx = position; 1762 } else { 1763 // Retaining the original cache position. 1764 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1765 // It's the responsibility of callers of this function to not add too many. 1766 assert(nextIdx != fBufIdx); 1767 } 1768 1769 }; 1770 1771 1772 /** 1773 * Add the boundary preceding the current position. 1774 * The current position can be left as it was, or changed to the newly added boundary, 1775 * as specified by the update parameter. 1776 */ addPreceding(int position, int ruleStatusIdx, boolean update)1777 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1778 assert(position < fBoundaries[fStartBufIdx]); 1779 assert(ruleStatusIdx <= Short.MAX_VALUE); 1780 int nextIdx = modChunkSize(fStartBufIdx - 1); 1781 if (nextIdx == fEndBufIdx) { 1782 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1783 // Failure. The insertion of the new boundary would claim the buffer position that is the 1784 // current iteration position. And we also want to retain the current iteration position. 1785 // (The buffer is already completely full of entries that precede the iteration position.) 1786 return false; 1787 } 1788 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1789 } 1790 fBoundaries[nextIdx] = position; 1791 fStatuses[nextIdx] = (short)ruleStatusIdx; 1792 fStartBufIdx = nextIdx; 1793 if (update == UpdateCachePosition) { 1794 fBufIdx = nextIdx; 1795 fTextIdx = position; 1796 } 1797 return true; 1798 }; 1799 1800 /** 1801 * Set the cache position to the specified position, or, if the position 1802 * falls between to cached boundaries, to the preceding boundary. 1803 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1804 * The startPosition must be on a code point boundary. 1805 * 1806 * Return true if successful, false if the specified position is after 1807 * the last cached boundary or before the first. 1808 */ 1809 boolean seek(int pos) { 1810 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1811 return false; 1812 } 1813 if (pos == fBoundaries[fStartBufIdx]) { 1814 // Common case: seek(0), from BreakIterator::first() 1815 fBufIdx = fStartBufIdx; 1816 fTextIdx = fBoundaries[fBufIdx]; 1817 return true; 1818 } 1819 if (pos == fBoundaries[fEndBufIdx]) { 1820 fBufIdx = fEndBufIdx; 1821 fTextIdx = fBoundaries[fBufIdx]; 1822 return true; 1823 } 1824 1825 int min = fStartBufIdx; 1826 int max = fEndBufIdx; 1827 while (min != max) { 1828 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1829 probe = modChunkSize(probe); 1830 if (fBoundaries[probe] > pos) { 1831 max = probe; 1832 } else { 1833 min = modChunkSize(probe + 1); 1834 } 1835 } 1836 assert(fBoundaries[max] > pos); 1837 fBufIdx = modChunkSize(max - 1); 1838 fTextIdx = fBoundaries[fBufIdx]; 1839 assert(fTextIdx <= pos); 1840 return true; 1841 1842 }; 1843 1844 1845 /** 1846 * copy constructor, used from RuleBasedBreakIterator.clone(). 1847 * 1848 * @param src 1849 */ BreakCache(BreakCache src)1850 BreakCache(BreakCache src) { 1851 fStartBufIdx = src.fStartBufIdx; 1852 fEndBufIdx = src.fEndBufIdx; 1853 fTextIdx = src.fTextIdx; 1854 fBufIdx = src.fBufIdx; 1855 fBoundaries = src.fBoundaries.clone(); 1856 fStatuses = src.fStatuses.clone(); 1857 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1858 } 1859 dumpCache()1860 void dumpCache() { 1861 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1862 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1863 System.out.printf("%d %d%n", i, fBoundaries[i]); 1864 if (i == fEndBufIdx) { 1865 break; 1866 } 1867 } 1868 }; 1869 modChunkSize(int index)1870 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1871 1872 static final int CACHE_SIZE = 128; 1873 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1874 1875 int fStartBufIdx; 1876 int fEndBufIdx; // inclusive 1877 1878 int fTextIdx; 1879 int fBufIdx; 1880 1881 int[] fBoundaries = new int[CACHE_SIZE]; 1882 short[] fStatuses = new short[CACHE_SIZE]; 1883 1884 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1885 }; 1886 1887 1888 1889 1890 } 1891 1892