1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2005-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import static com.ibm.icu.impl.CharacterIteration.DONE32; 13 import static com.ibm.icu.impl.CharacterIteration.next32; 14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32; 15 16 import java.io.ByteArrayOutputStream; 17 import java.io.IOException; 18 import java.io.InputStream; 19 import java.io.OutputStream; 20 import java.nio.ByteBuffer; 21 import java.text.CharacterIterator; 22 import java.util.MissingResourceException; 23 import java.util.concurrent.ConcurrentLinkedQueue; 24 25 import com.ibm.icu.impl.CharacterIteration; 26 import com.ibm.icu.impl.ICUBinary; 27 import com.ibm.icu.impl.ICUDebug; 28 import com.ibm.icu.impl.RBBIDataWrapper; 29 import com.ibm.icu.impl.breakiter.BurmeseBreakEngine; 30 import com.ibm.icu.impl.breakiter.CjkBreakEngine; 31 import com.ibm.icu.impl.breakiter.DictionaryBreakEngine; 32 import com.ibm.icu.impl.breakiter.KhmerBreakEngine; 33 import com.ibm.icu.impl.breakiter.LSTMBreakEngine; 34 import com.ibm.icu.impl.breakiter.LanguageBreakEngine; 35 import com.ibm.icu.impl.breakiter.LaoBreakEngine; 36 import com.ibm.icu.impl.breakiter.ThaiBreakEngine; 37 import com.ibm.icu.impl.breakiter.UnhandledBreakEngine; 38 import com.ibm.icu.lang.UCharacter; 39 import com.ibm.icu.lang.UProperty; 40 import com.ibm.icu.lang.UScript; 41 import com.ibm.icu.util.CodePointTrie; 42 43 /** 44 * Rule Based Break Iterator 45 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 46 * 47 * @stable ICU 2.0 48 */ 49 public class RuleBasedBreakIterator extends BreakIterator { 50 //======================================================================= 51 // Constructors & Factories 52 //======================================================================= 53 54 /** 55 * private constructor 56 */ RuleBasedBreakIterator()57 private RuleBasedBreakIterator() { 58 fDictionaryCharCount = 0; 59 } 60 61 /** 62 * Create a break iterator from a precompiled set of break rules. 63 * 64 * Creating a break iterator from the binary rules is much faster than 65 * creating one from source rules. 66 * 67 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 68 * Binary break iterator rules are not guaranteed to be compatible between 69 * different versions of ICU. 70 * 71 * @param is an input stream supplying the compiled binary rules. 72 * @throws IOException if there is an error while reading the rules from the InputStream. 73 * @see #compileRules(String, OutputStream) 74 * @stable ICU 4.8 75 */ getInstanceFromCompiledRules(InputStream is)76 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 77 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 78 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 79 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 80 return This; 81 } 82 83 /** 84 * This factory method doesn't have an access modifier; it is only accessible in the same 85 * package. 86 * 87 * Create a break iterator from a precompiled set of break rules. 88 * 89 * Creating a break iterator from the binary rules is much faster than 90 * creating one from source rules. 91 * 92 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 93 * Binary break iterator rules are not guaranteed to be compatible between 94 * different versions of ICU. 95 * 96 * @param bytes a buffer supplying the compiled binary rules. 97 * @param phraseBreaking a flag indicating if phrase breaking is required. 98 * @throws IOException if there is an error while reading the rules from the buffer. 99 * @see #compileRules(String, OutputStream) 100 * @internal 101 */ getInstanceFromCompiledRules( ByteBuffer bytes, boolean phraseBreaking)102 /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( 103 ByteBuffer bytes, boolean phraseBreaking) throws IOException { 104 RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); 105 instance.fPhraseBreaking = phraseBreaking; 106 return instance; 107 } 108 109 /** 110 * Create a break iterator from a precompiled set of break rules. 111 * 112 * Creating a break iterator from the binary rules is much faster than 113 * creating one from source rules. 114 * 115 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 116 * Binary break iterator rules are not guaranteed to be compatible between 117 * different versions of ICU. 118 * 119 * @param bytes a buffer supplying the compiled binary rules. 120 * @throws IOException if there is an error while reading the rules from the buffer. 121 * @see #compileRules(String, OutputStream) 122 * @internal 123 * @deprecated This API is ICU internal only. 124 */ 125 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)126 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 127 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 128 This.fRData = RBBIDataWrapper.get(bytes); 129 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 130 return This; 131 } 132 133 /** 134 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 135 * @param rules The break rules to be used. 136 * @stable ICU 2.2 137 */ RuleBasedBreakIterator(String rules)138 public RuleBasedBreakIterator(String rules) { 139 this(); 140 try { 141 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 142 compileRules(rules, ruleOS); 143 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 144 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 145 } catch (IOException e) { 146 ///CLOVER:OFF 147 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 148 // causing bogus compiled rules to be produced, but with no compile error raised. 149 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 150 + e.getMessage()); 151 throw rte; 152 ///CLOVER:ON 153 } 154 } 155 156 //======================================================================= 157 // Boilerplate 158 //======================================================================= 159 160 /** 161 * Clones this iterator. 162 * @return A newly-constructed RuleBasedBreakIterator with the same 163 * behavior as this one. 164 * @stable ICU 2.0 165 */ 166 @Override clone()167 public Object clone() { 168 RuleBasedBreakIterator result; 169 result = (RuleBasedBreakIterator)super.clone(); 170 if (fText != null) { 171 result.fText = (CharacterIterator)(fText.clone()); 172 } 173 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 174 result.fBreakCache = result.new BreakCache(fBreakCache); 175 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 176 return result; 177 } 178 179 180 /** 181 * Returns true if both BreakIterators are of the same class, have the same 182 * rules, and iterate over the same text. 183 * @stable ICU 2.0 184 */ 185 @Override equals(Object that)186 public boolean equals(Object that) { 187 if (that == null) { 188 return false; 189 } 190 if (this == that) { 191 return true; 192 } 193 try { 194 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 195 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 196 return false; 197 } 198 if (fRData != null && other.fRData != null && 199 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 200 return false; 201 } 202 if (fText == null && other.fText == null) { 203 return true; 204 } 205 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 206 return false; 207 } 208 return fPosition == other.fPosition; 209 } 210 catch(ClassCastException e) { 211 return false; 212 } 213 } 214 215 /** 216 * Returns the description (rules) used to create this iterator. 217 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 218 * @stable ICU 2.0 219 */ 220 @Override toString()221 public String toString() { 222 String retStr = ""; 223 if (fRData != null) { 224 retStr = fRData.fRuleSource; 225 } 226 return retStr; 227 } 228 229 /** 230 * Compute a hashcode for this BreakIterator 231 * @return A hash code 232 * @stable ICU 2.0 233 */ 234 @Override hashCode()235 public int hashCode() 236 { 237 return fRData.fRuleSource.hashCode(); 238 } 239 240 241 private static final int START_STATE = 1; // The state number of the starting state 242 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 243 244 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 245 // of user text. A variable with this enum type keeps track of where we 246 // are. The state machine only fetches user text input while in RUN mode. 247 private static final int RBBI_START = 0; 248 private static final int RBBI_RUN = 1; 249 private static final int RBBI_END = 2; 250 251 /** 252 * The character iterator through which this BreakIterator accesses the text. 253 */ 254 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 255 256 /** 257 * The rule data for this BreakIterator instance. 258 * Not intended for public use. Declared public for testing purposes only. 259 * @internal 260 * @deprecated This API is ICU internal only. 261 */ 262 @Deprecated 263 public RBBIDataWrapper fRData; 264 265 /** 266 * The iteration state - current position, rule status for the current position, 267 * and whether the iterator ran off the end, yielding UBRK_DONE. 268 * Current position is pinned to be 0 < position <= text.length. 269 * Current position is always set to a boundary. 270 * 271 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 272 * Never has the value UBRK_DONE (-1). 273 */ 274 private int fPosition; 275 276 /** 277 * Index of the Rule {tag} values for the most recent match. 278 */ 279 private int fRuleStatusIndex; 280 281 /** 282 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 283 */ 284 private boolean fDone; 285 286 /** 287 * Array of look-ahead tentative results. 288 */ 289 private int[] fLookAheadMatches; 290 291 /** 292 * Cache of previously determined boundary positions. 293 */ 294 private BreakCache fBreakCache = new BreakCache(); 295 296 /** 297 * Flag used to indicate if phrase breaking is required. 298 */ 299 private boolean fPhraseBreaking = false; 300 301 302 /** 303 * Counter for the number of characters encountered with the "dictionary" 304 * flag set. Normal RBBI iterators don't use it, although the code 305 * for updating it is live. Dictionary Based break iterators (a subclass 306 * of us) access this field directly. 307 * @internal 308 */ 309 private int fDictionaryCharCount; 310 311 private DictionaryCache fDictionaryCache = new DictionaryCache(); 312 313 /** 314 * ICU debug argument name for RBBI 315 */ 316 private static final String RBBI_DEBUG_ARG = "rbbi"; 317 318 /** 319 * Debugging flag. Trace operation of state machine when true. 320 */ 321 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 322 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 323 324 /** 325 * The "default" break engine - just skips over ranges of dictionary words, 326 * producing no breaks. Should only be used if characters need to be handled 327 * by a dictionary but we have no dictionary implementation for them. 328 * 329 * Only one instance; shared by all break iterators. 330 */ 331 private static final UnhandledBreakEngine gUnhandledBreakEngine; 332 333 /** 334 * List of all known break engines, common for all break iterators. 335 * Lazily updated as break engines are needed, because instantiation of 336 * break engines is expensive. 337 * 338 * Important notes: 339 * <ul>Because we don't want to add the same LanguageBreakEngine multiple times, all writes 340 * are synchronized. 341 * <ul>Read access avoids explicit synchronization, but will end up being synchronized if 342 * needed. 343 */ 344 private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines; 345 346 static { 347 gUnhandledBreakEngine = new UnhandledBreakEngine(); 348 gAllBreakEngines = new ConcurrentLinkedQueue<>(); 349 gAllBreakEngines.add(gUnhandledBreakEngine); 350 } 351 352 /** 353 * Dump the contents of the state table and character classes for this break iterator. 354 * For debugging only. 355 * @internal 356 * @deprecated This API is ICU internal only. 357 */ 358 @Deprecated dump(java.io.PrintStream out)359 public void dump(java.io.PrintStream out) { 360 if (out == null) { 361 out = System.out; 362 } 363 this.fRData.dump(out); 364 } 365 366 /** 367 * Compile a set of source break rules into the binary state tables used 368 * by the break iterator engine. Creating a break iterator from precompiled 369 * rules is much faster than creating one from source rules. 370 * 371 * Binary break rules are not guaranteed to be compatible between different 372 * versions of ICU. 373 * 374 * 375 * @param rules The source form of the break rules 376 * @param ruleBinary An output stream to receive the compiled rules. 377 * @throws IOException If there is an error writing the output. 378 * @see #getInstanceFromCompiledRules(InputStream) 379 * @stable ICU 4.8 380 */ compileRules(String rules, OutputStream ruleBinary)381 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 382 RBBIRuleBuilder.compileRules(rules, ruleBinary); 383 } 384 385 //======================================================================= 386 // BreakIterator overrides 387 //======================================================================= 388 389 /** 390 * Sets the current iteration position to the beginning of the text. 391 * (i.e., the CharacterIterator's starting offset). 392 * @return The offset of the beginning of the text. 393 * @stable ICU 2.0 394 */ 395 @Override first()396 public int first() { 397 if (fText == null) { 398 return BreakIterator.DONE; 399 } 400 fText.first(); 401 int start = fText.getIndex(); 402 if (!fBreakCache.seek(start)) { 403 fBreakCache.populateNear(start); 404 } 405 fBreakCache.current(); 406 assert(fPosition == start); 407 return fPosition; 408 } 409 410 /** 411 * Sets the current iteration position to the end of the text. 412 * (i.e., the CharacterIterator's ending offset). 413 * @return The text's past-the-end offset. 414 * @stable ICU 2.0 415 */ 416 @Override last()417 public int last() { 418 if (fText == null) { 419 return BreakIterator.DONE; 420 } 421 int endPos = fText.getEndIndex(); 422 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 423 assert(endShouldBeBoundary); 424 if (fPosition != endPos) { 425 assert(fPosition == endPos); 426 } 427 return endPos; 428 } 429 430 /** 431 * Advances the iterator either forward or backward the specified number of steps. 432 * Negative values move backward, and positive values move forward. This is 433 * equivalent to repeatedly calling next() or previous(). 434 * @param n The number of steps to move. The sign indicates the direction 435 * (negative is backwards, and positive is forwards). 436 * @return The character offset of the boundary position n boundaries away from 437 * the current one. 438 * @stable ICU 2.0 439 */ 440 @Override next(int n)441 public int next(int n) { 442 int result = 0; 443 if (n > 0) { 444 for (; n > 0 && result != DONE; --n) { 445 result = next(); 446 } 447 } else if (n < 0) { 448 for (; n < 0 && result != DONE; ++n) { 449 result = previous(); 450 } 451 } else { 452 result = current(); 453 } 454 return result; 455 } 456 457 /** 458 * Advances the iterator to the next boundary position. 459 * @return The position of the first boundary after this one. 460 * @stable ICU 2.0 461 */ 462 @Override next()463 public int next() { 464 fBreakCache.next(); 465 return fDone ? DONE : fPosition; 466 } 467 468 /** 469 * Moves the iterator backwards, to the boundary preceding the current one. 470 * @return The position of the boundary position immediately preceding the starting position. 471 * @stable ICU 2.0 472 */ 473 @Override previous()474 public int previous() { 475 fBreakCache.previous(); 476 return fDone ? DONE : fPosition; 477 } 478 479 /** 480 * Sets the iterator to refer to the first boundary position following 481 * the specified position. 482 * @param startPos The position from which to begin searching for a break position. 483 * @return The position of the first break after the current position. 484 * @stable ICU 2.0 485 */ 486 @Override following(int startPos)487 public int following(int startPos) { 488 // if the supplied position is before the beginning, return the 489 // text's starting offset 490 if (startPos < fText.getBeginIndex()) { 491 return first(); 492 } 493 494 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 495 // Or it may be beyond the end of the text. 496 startPos = CISetIndex32(fText, startPos); 497 fBreakCache.following(startPos); 498 return fDone ? DONE : fPosition; 499 } 500 501 502 /** 503 * Sets the iterator to refer to the last boundary position before the 504 * specified position. 505 * @param offset The position to begin searching for a break from. 506 * @return The position of the last boundary before the starting position. 507 * @stable ICU 2.0 508 */ 509 @Override preceding(int offset)510 public int preceding(int offset) { 511 if (fText == null || offset > fText.getEndIndex()) { 512 return last(); 513 } else if (offset < fText.getBeginIndex()) { 514 return first(); 515 } 516 517 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 518 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 519 int adjustedOffset = offset; 520 fBreakCache.preceding(adjustedOffset); 521 return fDone ? DONE : fPosition; 522 523 } 524 525 526 /** 527 * Throw IllegalArgumentException unless begin <= offset < end. 528 * @stable ICU 2.0 529 */ checkOffset(int offset, CharacterIterator text)530 protected static final void checkOffset(int offset, CharacterIterator text) { 531 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 532 throw new IllegalArgumentException("offset out of bounds"); 533 } 534 } 535 536 537 /** 538 * Returns true if the specified position is a boundary position. As a side 539 * effect, leaves the iterator pointing to the first boundary position at 540 * or after "offset". 541 * @param offset the offset to check. 542 * @return True if "offset" is a boundary position. 543 * @stable ICU 2.0 544 */ 545 @Override isBoundary(int offset)546 public boolean isBoundary(int offset) { 547 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 548 // to not be boundaries, and to not be errors. 549 checkOffset(offset, fText); 550 551 // Adjust offset to be on a code point boundary and not beyond the end of the text. 552 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 553 // But we still need the side effect of leaving iteration at the following boundary. 554 int adjustedOffset = CISetIndex32(fText, offset); 555 556 boolean result = false; 557 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 558 result = (fBreakCache.current() == offset); 559 } 560 561 if (!result) { 562 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 563 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 564 next(); 565 } 566 return result; 567 568 } 569 570 /** 571 * Returns the current iteration position. Note that DONE is never 572 * returned from this function; if iteration has run to the end of a 573 * string, current() will return the length of the string while 574 * next() will return BreakIterator.DONE). 575 * @return The current iteration position. 576 * @stable ICU 2.0 577 */ 578 @Override current()579 public int current() { 580 return (fText != null) ? fPosition : BreakIterator.DONE; 581 } 582 583 584 /** 585 * Return the status tag from the break rule that determined the boundary at 586 * the current iteration position. The values appear in the rule source 587 * within brackets, {123}, for example. For rules that do not specify a 588 * status, a default value of 0 is returned. If more than one rule applies, 589 * the numerically largest of the possible status values is returned. 590 * <p> 591 * Of the standard types of ICU break iterators, only the word and line break 592 * iterator provides status values. The values are defined in 593 * class RuleBasedBreakIterator, and allow distinguishing between words 594 * that contain alphabetic letters, "words" that appear to be numbers, 595 * punctuation and spaces, words containing ideographic characters, and 596 * more. Call <code>getRuleStatus</code> after obtaining a boundary 597 * position from <code>next()</code>, <code>previous()</code>, or 598 * any other break iterator functions that returns a boundary position. 599 * <p> 600 * Note that <code>getRuleStatus()</code> returns the value corresponding to 601 * <code>current()</code> index even after <code>next()</code> has returned DONE. 602 * <p> 603 604 * @return the status from the break rule that determined the boundary 605 * at the current iteration position. 606 * 607 * @stable ICU 60 608 */ 609 610 @Override getRuleStatus()611 public int getRuleStatus() { 612 // Status records have this form: 613 // Count N <-- fLastRuleStatusIndex points here. 614 // Status val 0 615 // Status val 1 616 // ... 617 // Status val N-1 <-- the value we need to return 618 // The status values are sorted in ascending order. 619 // This function returns the last (largest) of the array of status values. 620 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 621 int tagVal = fRData.fStatusTable[idx]; 622 return tagVal; 623 } 624 625 /** 626 * Get the status (tag) values from the break rule(s) that determined the boundary 627 * at the current iteration position. The values appear in the rule source 628 * within brackets, {123}, for example. The default status value for rules 629 * that do not explicitly provide one is zero. 630 * <p> 631 * The status values used by the standard ICU break rules are defined 632 * as public constants in class RuleBasedBreakIterator. 633 * <p> 634 * If the size of the output array is insufficient to hold the data, 635 * the output will be truncated to the available length. No exception 636 * will be thrown. 637 * 638 * @param fillInArray an array to be filled in with the status values. 639 * @return The number of rule status values from the rules that determined 640 * the boundary at the current iteration position. 641 * In the event that the array is too small, the return value 642 * is the total number of status values that were available, 643 * not the reduced number that were actually returned. 644 * @stable ICU 60 645 */ 646 @Override getRuleStatusVec(int[] fillInArray)647 public int getRuleStatusVec(int[] fillInArray) { 648 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 649 if (fillInArray != null) { 650 int numToCopy = Math.min(numStatusVals, fillInArray.length); 651 for (int i=0; i<numToCopy; i++) { 652 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 653 } 654 } 655 return numStatusVals; 656 } 657 658 /** 659 * Returns a CharacterIterator over the text being analyzed. 660 * <p> 661 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 662 * must not be modified in any way while the BreakIterator is still in use. 663 * Doing so will lead to undefined behavior of the BreakIterator. 664 * Clone the returned CharacterIterator first and work with that. 665 * <p> 666 * The returned CharacterIterator is a reference 667 * to the <b>actual iterator being used</b> by the BreakIterator. 668 * No guarantees are made about the current position 669 * of this iterator when it is returned; it may differ from the 670 * BreakIterators current position. If you need to move that 671 * position to examine the text, clone this function's return value first. 672 * @return An iterator over the text being analyzed. 673 * @stable ICU 2.0 674 */ 675 @Override getText()676 public CharacterIterator getText() { 677 return fText; 678 } 679 680 /** 681 * Set the iterator to analyze a new piece of text. This function resets 682 * the current iteration position to the beginning of the text. 683 * (The old iterator is dropped.) 684 * <p> 685 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 686 * directly by the BreakIterator, and must not be altered in any 687 * way by code outside of the BreakIterator. 688 * Doing so will lead to undefined behavior of the BreakIterator. 689 * 690 * @param newText An iterator over the text to analyze. 691 * @stable ICU 2.0 692 */ 693 @Override setText(CharacterIterator newText)694 public void setText(CharacterIterator newText) { 695 if (newText != null) { 696 fBreakCache.reset(newText.getBeginIndex(), 0); 697 } else { 698 fBreakCache.reset(); 699 } 700 fDictionaryCache.reset(); 701 fText = newText; 702 this.first(); 703 } 704 705 /** 706 * Control debug, trace and dump options. 707 * @internal 708 * @deprecated This API is ICU internal only. 709 */ 710 @Deprecated 711 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 712 ICUDebug.value(RBBI_DEBUG_ARG) : null; 713 714 getLanguageBreakEngine(int c)715 private LanguageBreakEngine getLanguageBreakEngine(int c) { 716 717 // We have a dictionary character. 718 // Does an already instantiated break engine handle it? 719 // First read without synchronization, which could lead to a new language 720 // break engine being added and we didn't go over it. 721 for (LanguageBreakEngine candidate : gAllBreakEngines) { 722 if (candidate.handles(c)) { 723 return candidate; 724 } 725 } 726 727 synchronized (gAllBreakEngines) { 728 // Another break iterator may have instantiated the desired engine. 729 for (LanguageBreakEngine candidate : gAllBreakEngines) { 730 if (candidate.handles(c)) { 731 return candidate; 732 } 733 } 734 735 // The global list doesn't have an existing engine, build one. 736 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 737 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 738 // Katakana, Hiragana and Han are handled by the same dictionary engine. 739 // Fold them together for mapping from script -> engine. 740 script = UScript.HAN; 741 } 742 743 LanguageBreakEngine eng; 744 try { 745 switch (script) { 746 case UScript.THAI: 747 try { 748 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 749 } catch (MissingResourceException e) { 750 eng = new ThaiBreakEngine(); 751 } 752 break; 753 case UScript.LAO: 754 eng = new LaoBreakEngine(); 755 break; 756 case UScript.MYANMAR: 757 try { 758 eng = LSTMBreakEngine.create(script, LSTMBreakEngine.createData(script)); 759 } catch (MissingResourceException e) { 760 eng = new BurmeseBreakEngine(); 761 } 762 break; 763 case UScript.KHMER: 764 eng = new KhmerBreakEngine(); 765 break; 766 case UScript.HAN: 767 eng = new CjkBreakEngine(false); 768 break; 769 case UScript.HANGUL: 770 eng = new CjkBreakEngine(true); 771 break; 772 default: 773 gUnhandledBreakEngine.handleChar(c); 774 eng = gUnhandledBreakEngine; 775 break; 776 } 777 } catch (IOException e) { 778 eng = null; 779 } 780 781 if (eng != null && eng != gUnhandledBreakEngine) { 782 gAllBreakEngines.add(eng); 783 } 784 return eng; 785 } // end synchronized(gAllBreakEngines) 786 } 787 788 /** 789 * The State Machine Engine for moving forward is here. 790 * This function is the heart of the RBBI run time engine. 791 * 792 * Input 793 * fPosition, the position in the text to begin from. 794 * Output 795 * fPosition: the boundary following the starting position. 796 * fDictionaryCharCount the number of dictionary characters encountered. 797 * If > 0, the segment will be further subdivided 798 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 799 * 800 * @return the new iterator position 801 * 802 * A note on supplementary characters and the position of underlying 803 * Java CharacterIterator: Normally, a character iterator is positioned at 804 * the char most recently returned by next(). Within this function, when 805 * a supplementary char is being processed, the char iterator is left 806 * sitting on the trail surrogate, in the middle of the code point. 807 * This is different from everywhere else, where an iterator always 808 * points at the lead surrogate of a supplementary. 809 */ handleNext()810 private int handleNext() { 811 if (TRACE) { 812 System.out.println("Handle Next pos char state category"); 813 } 814 815 // handleNext always sets the break tag value. 816 // Set the default for it. 817 fRuleStatusIndex = 0; 818 fDictionaryCharCount = 0; 819 820 // caches for quicker access 821 CharacterIterator text = fText; 822 CodePointTrie trie = fRData.fTrie; 823 824 char[] stateTable = fRData.fFTable.fTable; 825 int initialPosition = fPosition; 826 text.setIndex(initialPosition); 827 int result = initialPosition; 828 829 // Set up the starting char 830 int c = text.current(); 831 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 832 c = nextTrail32(text, c); 833 if (c == DONE32) { 834 fDone = true; 835 return BreakIterator.DONE; 836 } 837 } 838 839 // Set the initial state for the state machine 840 int state = START_STATE; 841 int row = fRData.getRowIndex(state); 842 short category = 3; 843 int flagsState = fRData.fFTable.fFlags; 844 int dictStart = fRData.fFTable.fDictCategoriesStart; 845 int mode = RBBI_RUN; 846 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 847 category = 2; 848 mode = RBBI_START; 849 if (TRACE) { 850 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 851 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 852 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 853 } 854 } 855 856 // loop until we reach the end of the text or transition to state 0 857 while (state != STOP_STATE) { 858 if (c == DONE32) { 859 // Reached end of input string. 860 if (mode == RBBI_END) { 861 // We have already run the loop one last time with the 862 // character set to the pseudo {eof} value. Now it is time 863 // to unconditionally bail out. 864 break; 865 } 866 // Run the loop one last time with the fake end-of-input character category 867 mode = RBBI_END; 868 category = 1; 869 } 870 else if (mode == RBBI_RUN) { 871 // Get the char category. An incoming category of 1 or 2 mens that 872 // we are preset for doing the beginning or end of input, and 873 // that we shouldn't get a category from an actual text input character. 874 // 875 876 // look up the current character's character category, which tells us 877 // which column in the state table to look at. 878 // 879 category = (short) trie.get(c); 880 881 // Check for categories that require word dictionary handling. 882 if (category >= dictStart) { 883 fDictionaryCharCount++; 884 } 885 886 if (TRACE) { 887 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 888 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 889 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 890 } 891 892 // Advance to the next character. 893 // If this is a beginning-of-input loop iteration, don't advance. 894 // The next iteration will be processing the first real input character. 895 c = text.next(); 896 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 897 c = nextTrail32(text, c); 898 } 899 } 900 else { 901 mode = RBBI_RUN; 902 } 903 904 // look up a state transition in the state table 905 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 906 row = fRData.getRowIndex(state); 907 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 908 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 909 // Match found, common case 910 result = text.getIndex(); 911 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 912 // The iterator has been left in the middle of a surrogate pair. 913 // We want the start of it. 914 result--; 915 } 916 917 // Remember the break status (tag) values. 918 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 919 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 920 // Lookahead match is completed 921 int lookaheadResult = fLookAheadMatches[accepting]; 922 if (lookaheadResult >= 0) { 923 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 924 fPosition = lookaheadResult; 925 return lookaheadResult; 926 } 927 } 928 929 930 // If we are at the position of the '/' in a look-ahead (hard break) rule; 931 // record the current position, to be returned later, if the full rule matches. 932 // TODO: Move this check before the previous check of fAccepting. 933 // This would enable hard-break rules with no following context. 934 // But there are line break test failures when trying this. Investigate. 935 // Issue ICU-20837 936 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 937 if (rule != 0) { 938 int pos = text.getIndex(); 939 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 940 // The iterator has been left in the middle of a surrogate pair. 941 // We want the beginning of it. 942 pos--; 943 } 944 fLookAheadMatches[rule] = pos; 945 } 946 947 948 } // End of state machine main loop 949 950 // The state machine is done. Check whether it found a match... 951 952 // If the iterator failed to advance in the match engine force it ahead by one. 953 // This indicates a defect in the break rules, which should always match 954 // at least one character. 955 956 if (result == initialPosition) { 957 if (TRACE) { 958 System.out.println("Iterator did not move. Advancing by 1."); 959 } 960 text.setIndex(initialPosition); 961 next32(text); 962 result = text.getIndex(); 963 fRuleStatusIndex = 0; 964 } 965 966 // Leave the iterator at our result position. 967 // (we may have advanced beyond the last accepting position chasing after 968 // longer matches that never completed.) 969 fPosition = result; 970 971 if (TRACE) { 972 System.out.println("result = " + result); 973 } 974 return result; 975 } 976 977 /** 978 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 979 * This locates a "Safe Position" from which the forward break rules 980 * will operate correctly. A Safe Position is not necessarily a boundary itself. 981 * 982 * The logic of this function is very similar to handleNext(), above, but simpler 983 * because the safe table does not require as many options. 984 * 985 * @param fromPosition the position in the input text to begin the iteration. 986 * @internal 987 */ handleSafePrevious(int fromPosition)988 private int handleSafePrevious(int fromPosition) { 989 char state; 990 short category = 0; 991 int result = 0; 992 993 // caches for quicker access 994 CharacterIterator text = fText; 995 CodePointTrie trie = fRData.fTrie; 996 char[] stateTable = fRData.fRTable.fTable; 997 998 CISetIndex32(text, fromPosition); 999 if (TRACE) { 1000 System.out.print("Handle Previous pos char state category"); 1001 } 1002 1003 // if we're already at the start of the text, return DONE. 1004 if (text.getIndex() == text.getBeginIndex()) { 1005 return BreakIterator.DONE; 1006 } 1007 1008 // Set the initial state for the state machine 1009 int c = CharacterIteration.previous32(text); 1010 state = START_STATE; 1011 int row = fRData.getRowIndex(state); 1012 1013 // loop until we reach the start of the text or transition to state 0 1014 // 1015 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 1016 1017 // look up the current character's character category, which tells us 1018 // which column in the state table to look at. 1019 // 1020 // And off the dictionary flag bit. For reverse iteration it is not used. 1021 category = (short) trie.get(c); 1022 if (TRACE) { 1023 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1024 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1025 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1026 } 1027 1028 // State Transition - move machine to its next state 1029 // 1030 assert(category < fRData.fHeader.fCatCount); 1031 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1032 row = fRData.getRowIndex(state); 1033 1034 if (state == STOP_STATE) { 1035 // This is the normal exit from the lookup state machine. 1036 // Transition to state zero means we have found a safe point. 1037 break; 1038 } 1039 } 1040 1041 // The state machine is done. 1042 result = text.getIndex(); 1043 if (TRACE) { 1044 System.out.println("result = " + result); 1045 } 1046 return result; 1047 } 1048 1049 /** 1050 * Set the index of a CharacterIterator. 1051 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1052 * If the index points to a trail surrogate of a supplementary character, adjust it 1053 * to the start (lead surrogate) index. 1054 * 1055 * @param ci A CharacterIterator to set 1056 * @param index the index to set 1057 * @return the resulting index, possibly pinned or adjusted. 1058 */ 1059 private static int CISetIndex32(CharacterIterator ci, int index) { 1060 if (index <= ci.getBeginIndex()) { 1061 ci.first(); 1062 } else if (index >= ci.getEndIndex()) { 1063 ci.setIndex(ci.getEndIndex()); 1064 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1065 if (!Character.isHighSurrogate(ci.previous())) { 1066 ci.next(); 1067 } 1068 } 1069 return ci.getIndex(); 1070 } 1071 1072 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1073 * Dictionary boundaries are moved first to this cache, then from here 1074 * to the main BreakCache, where they may inter-leave with non-dictionary 1075 * boundaries. The public BreakIterator API always fetches directly 1076 * from the main BreakCache, not from here. 1077 * 1078 * In common situations, the number of boundaries in a single dictionary run 1079 * should be quite small, it will be terminated by punctuation, spaces, 1080 * or any other non-dictionary characters. The main BreakCache may end 1081 * up with boundaries from multiple dictionary based runs. 1082 * 1083 * The boundaries are stored in a simple ArrayList (vector), with the 1084 * assumption that they will be accessed sequentially. 1085 */ 1086 class DictionaryCache { 1087 1088 void reset() { 1089 fPositionInCache = -1; 1090 fStart = 0; 1091 fLimit = 0; 1092 fFirstRuleStatusIndex = 0; 1093 fOtherRuleStatusIndex = 0; 1094 fBreaks.removeAllElements(); 1095 }; 1096 1097 boolean following(int fromPos) { 1098 if (fromPos >= fLimit || fromPos < fStart) { 1099 fPositionInCache = -1; 1100 return false; 1101 } 1102 1103 // Sequential iteration, move from previous boundary to the following 1104 1105 int r = 0; 1106 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1107 ++fPositionInCache; 1108 if (fPositionInCache >= fBreaks.size()) { 1109 fPositionInCache = -1; 1110 return false; 1111 } 1112 r = fBreaks.elementAt(fPositionInCache); 1113 assert(r > fromPos); 1114 fBoundary = r; 1115 fStatusIndex = fOtherRuleStatusIndex; 1116 return true; 1117 } 1118 1119 // Random indexing. Linear search for the boundary following the given position. 1120 1121 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1122 r= fBreaks.elementAt(fPositionInCache); 1123 if (r > fromPos) { 1124 fBoundary = r; 1125 fStatusIndex = fOtherRuleStatusIndex; 1126 return true; 1127 } 1128 } 1129 1130 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1131 assert(false); 1132 fPositionInCache = -1; 1133 return false; 1134 }; 1135 preceding(int fromPos)1136 boolean preceding(int fromPos) { 1137 if (fromPos <= fStart || fromPos > fLimit) { 1138 fPositionInCache = -1; 1139 return false; 1140 } 1141 1142 if (fromPos == fLimit) { 1143 fPositionInCache = fBreaks.size() - 1; 1144 if (fPositionInCache >= 0) { 1145 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1146 } 1147 } 1148 1149 int r; 1150 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1151 --fPositionInCache; 1152 r = fBreaks.elementAt(fPositionInCache); 1153 assert(r < fromPos); 1154 fBoundary = r; 1155 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1156 return true; 1157 } 1158 1159 if (fPositionInCache == 0) { 1160 fPositionInCache = -1; 1161 return false; 1162 } 1163 1164 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1165 r = fBreaks.elementAt(fPositionInCache); 1166 if (r < fromPos) { 1167 fBoundary = r; 1168 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1169 return true; 1170 } 1171 } 1172 assert(false); 1173 fPositionInCache = -1; 1174 return false; 1175 }; 1176 1177 /** 1178 * Populate the cache with the dictionary based boundaries within a region of text. 1179 * @param startPos The start position of a range of text 1180 * @param endPos The end position of a range of text 1181 * @param firstRuleStatus The rule status index that applies to the break at startPos 1182 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1183 * @internal 1184 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1185 void populateDictionary(int startPos, int endPos, 1186 int firstRuleStatus, int otherRuleStatus) { 1187 if ((endPos - startPos) <= 1) { 1188 return; 1189 } 1190 1191 reset(); 1192 fFirstRuleStatusIndex = firstRuleStatus; 1193 fOtherRuleStatusIndex = otherRuleStatus; 1194 1195 int rangeStart = startPos; 1196 int rangeEnd = endPos; 1197 1198 int category; 1199 int current; 1200 int foundBreakCount = 0; 1201 1202 // Loop through the text, looking for ranges of dictionary characters. 1203 // For each span, find the appropriate break engine, and ask it to find 1204 // any breaks within the span. 1205 1206 fText.setIndex(rangeStart); 1207 int c = CharacterIteration.current32(fText); 1208 category = (short)fRData.fTrie.get(c); 1209 int dictStart = fRData.fFTable.fDictCategoriesStart; 1210 1211 while(true) { 1212 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1213 c = CharacterIteration.next32(fText); // pre-increment 1214 category = (short)fRData.fTrie.get(c); 1215 } 1216 if (current >= rangeEnd) { 1217 break; 1218 } 1219 1220 // We now have a dictionary character. Get the appropriate language object 1221 // to deal with it. 1222 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1223 1224 // Ask the language object if there are any breaks. It will add them to the cache and 1225 // leave the text pointer on the other side of its range, ready to search for the next one. 1226 if (lbe != null) { 1227 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking); 1228 } 1229 1230 // Reload the loop variables for the next go-round 1231 c = CharacterIteration.current32(fText); 1232 category = (short)fRData.fTrie.get(c); 1233 } 1234 1235 // If we found breaks, ensure that the first and last entries are 1236 // the original starting and ending position. And initialize the 1237 // cache iteration position to the first entry. 1238 1239 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1240 if (foundBreakCount > 0) { 1241 assert(foundBreakCount == fBreaks.size()); 1242 if (startPos < fBreaks.elementAt(0)) { 1243 // The dictionary did not place a boundary at the start of the segment of text. 1244 // Add one now. This should not commonly happen, but it would be easy for interactions 1245 // of the rules for dictionary segments and the break engine implementations to 1246 // inadvertently cause it. Cover it here, just in case. 1247 fBreaks.offer(startPos); 1248 } 1249 if (endPos > fBreaks.peek()) { 1250 fBreaks.push(endPos); 1251 } 1252 fPositionInCache = 0; 1253 // Note: Dictionary matching may extend beyond the original limit. 1254 fStart = fBreaks.elementAt(0); 1255 fLimit = fBreaks.peek(); 1256 } else { 1257 // there were no language-based breaks, even though the segment contained 1258 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1259 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1260 } 1261 1262 }; 1263 1264 DictionaryCache()1265 DictionaryCache() { 1266 fPositionInCache = -1; 1267 fBreaks = new DictionaryBreakEngine.DequeI(); 1268 } 1269 1270 /** 1271 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1272 * 1273 * @param src the source object to be copied. 1274 */ DictionaryCache(DictionaryCache src)1275 DictionaryCache(DictionaryCache src) { 1276 try { 1277 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1278 } 1279 catch (CloneNotSupportedException e) { 1280 throw new RuntimeException(e); 1281 } 1282 fPositionInCache = src.fPositionInCache; 1283 fStart = src.fStart; 1284 fLimit = src.fLimit; 1285 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1286 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1287 fBoundary = src.fBoundary; 1288 fStatusIndex = src.fStatusIndex; 1289 } 1290 1291 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1292 DictionaryBreakEngine.DequeI fBreaks; 1293 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1294 // // or preceding(). Optimizes sequential access. 1295 int fStart; // Text position of first boundary in cache. 1296 int fLimit; // Last boundary in cache. Which is the limit of the 1297 // // text segment being handled by the dictionary. 1298 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1299 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1300 int fBoundary; // Current boundary. Set by preceding(), following(). 1301 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1302 }; 1303 1304 1305 1306 1307 /* 1308 * class BreakCache 1309 * 1310 * Cache of break boundary positions and rule status values. 1311 * Break iterator API functions, next(), previous(), etc., will use cached results 1312 * when possible, and otherwise cache new results as they are obtained. 1313 * 1314 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1315 * 1316 * The cache is implemented as a single circular buffer. 1317 */ 1318 1319 /* 1320 * size of the circular cache buffer. 1321 */ 1322 1323 class BreakCache { 1324 BreakCache()1325 BreakCache() { 1326 reset(); 1327 }; 1328 reset(int pos, int ruleStatus)1329 void reset(int pos, int ruleStatus) { 1330 fStartBufIdx = 0; 1331 fEndBufIdx = 0; 1332 fTextIdx = pos; 1333 fBufIdx = 0; 1334 fBoundaries[0] = pos; 1335 fStatuses[0] = (short)ruleStatus; 1336 } 1337 reset()1338 void reset() {reset(0, 0); }; 1339 next()1340 void next() { 1341 if (fBufIdx == fEndBufIdx) { 1342 fDone = !populateFollowing(); 1343 fPosition = fTextIdx; 1344 fRuleStatusIndex = fStatuses[fBufIdx]; 1345 } else { 1346 fBufIdx = modChunkSize(fBufIdx + 1); 1347 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1348 fRuleStatusIndex = fStatuses[fBufIdx]; 1349 } 1350 }; 1351 previous()1352 void previous() { 1353 int initialBufIdx = fBufIdx; 1354 if (fBufIdx == fStartBufIdx) { 1355 // At start of cache. Prepend to it. 1356 populatePreceding(); 1357 } else { 1358 // Cache already holds the next boundary 1359 fBufIdx = modChunkSize(fBufIdx - 1); 1360 fTextIdx = fBoundaries[fBufIdx]; 1361 } 1362 fDone = (fBufIdx == initialBufIdx); 1363 fPosition = fTextIdx; 1364 fRuleStatusIndex = fStatuses[fBufIdx]; 1365 return; 1366 }; 1367 1368 // Move the iteration state to the position following the startPosition. 1369 // Input position must be pinned to the input length. following(int startPos)1370 void following(int startPos) { 1371 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1372 // startPos is in the cache. Do a next() from that position. 1373 // TODO: an awkward set of interactions with bi->fDone 1374 // seek() does not clear it; it can't because of interactions with populateNear(). 1375 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1376 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1377 fDone = false; 1378 next(); 1379 } 1380 1381 }; 1382 preceding(int startPos)1383 void preceding(int startPos) { 1384 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1385 if (startPos == fTextIdx) { 1386 previous(); 1387 } else { 1388 // seek() leaves the BreakCache positioned at the preceding boundary 1389 // if the requested position is between two boundaries. 1390 // current() pushes the BreakCache position out to the BreakIterator itself. 1391 assert(startPos > fTextIdx); 1392 current(); 1393 } 1394 } 1395 return; 1396 }; 1397 1398 /** 1399 * Update the state of the public BreakIterator (fBI) to reflect the 1400 * current state of the break iterator cache (this). 1401 */ current()1402 int current() { 1403 fPosition = fTextIdx; 1404 fRuleStatusIndex = fStatuses[fBufIdx]; 1405 fDone = false; 1406 return fTextIdx; 1407 }; 1408 1409 /** 1410 * Add boundaries to the cache near the specified position. 1411 * The given position need not be a boundary itself. 1412 * The input position must be within the range of the text, and 1413 * on a code point boundary. 1414 * If the requested position is a break boundary, leave the iteration 1415 * position on it. 1416 * If the requested position is not a boundary, leave the iteration 1417 * position on the preceding boundary and include both the the 1418 * preceding and following boundaries in the cache. 1419 * Additional boundaries, either preceding or following, may be added 1420 * to the cache as a side effect. 1421 * 1422 * Return false if the operation failed. 1423 */ populateNear(int position)1424 boolean populateNear(int position) { 1425 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1426 1427 // Add boundaries to the cache near the specified position. 1428 // The given position need not be a boundary itself. 1429 // The input position must be within the range of the text, and 1430 // on a code point boundary. 1431 // If the requested position is a break boundary, leave the iteration 1432 // position on it. 1433 // If the requested position is not a boundary, leave the iteration 1434 // position on the preceding boundary and include both the 1435 // preceding and following boundaries in the cache. 1436 // Additional boundaries, either preceding or following, may be added 1437 // to the cache as a side effect. 1438 1439 // If the requested position is not near already cached positions, clear the existing cache, 1440 // find a near-by boundary and begin new cache contents there. 1441 1442 // Threshold for a text position to be considered near to existing cache contents. 1443 // TODO: See issue ICU-22024 "perf tuning of Cache needed." 1444 // This value is subject to change. See the ticket for more details. 1445 final int CACHE_NEAR = 15; 1446 1447 int startOfText = fText.getBeginIndex(); 1448 int aBoundary = -1; 1449 int ruleStatusIndex = 0; 1450 boolean retainCache = false; 1451 if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR) && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) { 1452 // Requested position is near the existing cache. Retain it. 1453 retainCache = true; 1454 } else if (position <= startOfText + CACHE_NEAR) { 1455 // Requested position is near the start of the text. Fill cache from start, skipping 1456 // the need to find a safe point. 1457 retainCache = false; 1458 aBoundary = startOfText; 1459 } else { 1460 // Requested position is not near the existing cache. 1461 // Find a safe point to refill the cache from. 1462 int backupPos = handleSafePrevious(position); 1463 1464 if (fBoundaries[fEndBufIdx] < position && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) { 1465 // The requested position is beyond the end of the existing cache, but the 1466 // reverse rules produced a position near or before the cached region. 1467 // Retain the existing cache, and fill from the end of it. 1468 retainCache = true; 1469 } else if (backupPos < startOfText + CACHE_NEAR) { 1470 // The safe reverse rules moved us to near the start of text. 1471 // Take that (index 0) as the backup boundary, avoiding the complication 1472 // (in the following block) of moving forward from the safe point to a known boundary. 1473 // 1474 // Retain the cache if it begins not too far from the requested position. 1475 aBoundary = startOfText; 1476 retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR)); 1477 } else { 1478 // The safe reverse rules produced a position that is neither near the existing 1479 // cache, nor near the start of text. 1480 // Advance to the boundary following. 1481 // There is a complication: the safe reverse rules identify pairs of code points 1482 // that are safe. If advancing from the safe point moves forwards by less than 1483 // two code points, we need to advance one more time to ensure that the boundary 1484 // is good, including a correct rules status value. 1485 // 1486 retainCache = false; 1487 fPosition = backupPos; 1488 aBoundary = handleNext(); 1489 if (aBoundary == backupPos + 1 || 1490 (aBoundary == backupPos + 2 && 1491 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1492 Character.isLowSurrogate(fText.next()))) { 1493 // The initial handleNext() only advanced by a single code point. Go again. 1494 // Safe rules identify safe pairs. 1495 aBoundary = handleNext(); 1496 } 1497 if (aBoundary == BreakIterator.DONE) { 1498 aBoundary = fText.getEndIndex(); 1499 } 1500 ruleStatusIndex = fRuleStatusIndex; 1501 } 1502 } 1503 1504 if (!retainCache) { 1505 assert(aBoundary != -1); 1506 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1507 } 1508 1509 1510 // Fill in boundaries between existing cache content and the new requested position. 1511 1512 if (fBoundaries[fEndBufIdx] < position) { 1513 // The last position in the cache precedes the requested position. 1514 // Add following position(s) to the cache. 1515 while (fBoundaries[fEndBufIdx] < position) { 1516 if (!populateFollowing()) { 1517 assert false; 1518 return false; 1519 } 1520 } 1521 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1522 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1523 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1524 previous(); 1525 } 1526 return true; 1527 } 1528 1529 if (fBoundaries[fStartBufIdx] > position) { 1530 // The first position in the cache is beyond the requested position. 1531 // back up more until we get a boundary <= the requested position. 1532 while (fBoundaries[fStartBufIdx] > position) { 1533 populatePreceding(); 1534 } 1535 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1536 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1537 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1538 next(); 1539 } 1540 if (fTextIdx > position) { 1541 // If position is not itself a boundary, the next() loop above will overshoot. 1542 // Back up one, leaving cache position at the boundary preceding the requested position. 1543 previous(); 1544 } 1545 return true; 1546 } 1547 1548 assert fTextIdx == position; 1549 return true; 1550 1551 }; 1552 1553 /** 1554 * Add boundary(s) to the cache following the current last boundary. 1555 * Return false if at the end of the text, and no more boundaries can be added. 1556 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1557 */ populateFollowing()1558 boolean populateFollowing() { 1559 int fromPosition = fBoundaries[fEndBufIdx]; 1560 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1561 int pos = 0; 1562 int ruleStatusIdx = 0; 1563 1564 if (fDictionaryCache.following(fromPosition)) { 1565 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1566 return true; 1567 } 1568 1569 fPosition = fromPosition; 1570 pos = handleNext(); 1571 if (pos == BreakIterator.DONE) { 1572 return false; 1573 } 1574 1575 ruleStatusIdx = fRuleStatusIndex; 1576 if (fDictionaryCharCount > 0) { 1577 // The text segment obtained from the rules includes dictionary characters. 1578 // Subdivide it, with subdivided results going into the dictionary cache. 1579 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1580 if (fDictionaryCache.following(fromPosition)) { 1581 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1582 return true; 1583 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1584 // But be careful with interactions with populateNear(). 1585 } 1586 } 1587 1588 // Rule based segment did not include dictionary characters. 1589 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1590 // meaning that we didn't take the return, above. 1591 // Add its end point to the cache. 1592 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1593 1594 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1595 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1596 // 1597 for (int count=0; count<6; ++count) { 1598 pos = handleNext(); 1599 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1600 break; 1601 } 1602 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1603 } 1604 return true; 1605 }; 1606 1607 /** 1608 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1609 * Leave the iteration position on the first added boundary. 1610 * Return false if no boundaries could be added (if at the start of the text.) 1611 */ populatePreceding()1612 boolean populatePreceding() { 1613 int textBegin = fText.getBeginIndex(); 1614 int fromPosition = fBoundaries[fStartBufIdx]; 1615 if (fromPosition == textBegin) { 1616 return false; 1617 } 1618 1619 int position = textBegin; 1620 int positionStatusIdx = 0; 1621 1622 if (fDictionaryCache.preceding(fromPosition)) { 1623 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1624 return true; 1625 } 1626 1627 int backupPosition = fromPosition; 1628 1629 // Find a boundary somewhere preceding the first already-cached boundary 1630 do { 1631 backupPosition = backupPosition - 30; 1632 if (backupPosition <= textBegin) { 1633 backupPosition = textBegin; 1634 } else { 1635 backupPosition = handleSafePrevious(backupPosition); 1636 } 1637 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1638 position = textBegin; 1639 positionStatusIdx = 0; 1640 } else { 1641 // Advance to the boundary following the backup position. 1642 // There is a complication: the safe reverse rules identify pairs of code points 1643 // that are safe. If advancing from the safe point moves forwards by less than 1644 // two code points, we need to advance one more time to ensure that the boundary 1645 // is good, including a correct rules status value. 1646 // 1647 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1648 position = handleNext(); 1649 if (position == backupPosition + 1 || 1650 (position == backupPosition + 2 && 1651 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1652 Character.isLowSurrogate(fText.next()))) { 1653 // The initial handleNext() only advanced by a single code point. Go again. 1654 // Safe rules identify safe pairs. 1655 position = handleNext(); 1656 } 1657 positionStatusIdx = fRuleStatusIndex; 1658 } 1659 } while (position >= fromPosition); 1660 1661 // Find boundaries between the one we just located and the first already-cached boundary 1662 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1663 1664 fSideBuffer.removeAllElements(); 1665 fSideBuffer.push(position); 1666 fSideBuffer.push(positionStatusIdx); 1667 1668 do { 1669 int prevPosition = fPosition = position; 1670 int prevStatusIdx = positionStatusIdx; 1671 position = handleNext(); 1672 positionStatusIdx = fRuleStatusIndex; 1673 if (position == BreakIterator.DONE) { 1674 break; 1675 } 1676 1677 boolean segmentHandledByDictionary = false; 1678 if (fDictionaryCharCount != 0) { 1679 // Segment from the rules includes dictionary characters. 1680 // Subdivide it, with subdivided results going into the dictionary cache. 1681 int dictSegEndPosition = position; 1682 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1683 while (fDictionaryCache.following(prevPosition)) { 1684 position = fDictionaryCache.fBoundary; 1685 positionStatusIdx = fDictionaryCache.fStatusIndex; 1686 segmentHandledByDictionary = true; 1687 assert(position > prevPosition); 1688 if (position >= fromPosition) { 1689 break; 1690 } 1691 assert(position <= dictSegEndPosition); 1692 fSideBuffer.push(position); 1693 fSideBuffer.push(positionStatusIdx); 1694 prevPosition = position; 1695 } 1696 assert(position==dictSegEndPosition || position>=fromPosition); 1697 } 1698 1699 if (!segmentHandledByDictionary && position < fromPosition) { 1700 fSideBuffer.push(position); 1701 fSideBuffer.push(positionStatusIdx); 1702 } 1703 } while (position < fromPosition); 1704 1705 // Move boundaries from the side buffer to the main circular buffer. 1706 boolean success = false; 1707 if (!fSideBuffer.isEmpty()) { 1708 positionStatusIdx = fSideBuffer.pop(); 1709 position = fSideBuffer.pop(); 1710 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1711 success = true; 1712 } 1713 1714 while (!fSideBuffer.isEmpty()) { 1715 positionStatusIdx = fSideBuffer.pop(); 1716 position = fSideBuffer.pop(); 1717 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1718 // No space in circular buffer to hold a new preceding result while 1719 // also retaining the current cache (iteration) position. 1720 // Bailing out is safe; the cache will refill again if needed. 1721 break; 1722 } 1723 } 1724 return success; 1725 }; 1726 1727 1728 static final boolean RetainCachePosition = false; 1729 static final boolean UpdateCachePosition = true; 1730 1731 /** 1732 * Add the boundary following the current position. 1733 * The current position can be left as it was, or changed to the newly added boundary, 1734 * as specified by the update parameter. 1735 */ addFollowing(int position, int ruleStatusIdx, boolean update)1736 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1737 assert(position > fBoundaries[fEndBufIdx]); 1738 assert(ruleStatusIdx <= Short.MAX_VALUE); 1739 int nextIdx = modChunkSize(fEndBufIdx + 1); 1740 if (nextIdx == fStartBufIdx) { 1741 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1742 } 1743 fBoundaries[nextIdx] = position; 1744 fStatuses[nextIdx] = (short)ruleStatusIdx; 1745 fEndBufIdx = nextIdx; 1746 if (update == UpdateCachePosition) { 1747 // Set current position to the newly added boundary. 1748 fBufIdx = nextIdx; 1749 fTextIdx = position; 1750 } else { 1751 // Retaining the original cache position. 1752 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1753 // It's the responsibility of callers of this function to not add too many. 1754 assert(nextIdx != fBufIdx); 1755 } 1756 1757 }; 1758 1759 1760 /** 1761 * Add the boundary preceding the current position. 1762 * The current position can be left as it was, or changed to the newly added boundary, 1763 * as specified by the update parameter. 1764 */ addPreceding(int position, int ruleStatusIdx, boolean update)1765 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1766 assert(position < fBoundaries[fStartBufIdx]); 1767 assert(ruleStatusIdx <= Short.MAX_VALUE); 1768 int nextIdx = modChunkSize(fStartBufIdx - 1); 1769 if (nextIdx == fEndBufIdx) { 1770 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1771 // Failure. The insertion of the new boundary would claim the buffer position that is the 1772 // current iteration position. And we also want to retain the current iteration position. 1773 // (The buffer is already completely full of entries that precede the iteration position.) 1774 return false; 1775 } 1776 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1777 } 1778 fBoundaries[nextIdx] = position; 1779 fStatuses[nextIdx] = (short)ruleStatusIdx; 1780 fStartBufIdx = nextIdx; 1781 if (update == UpdateCachePosition) { 1782 fBufIdx = nextIdx; 1783 fTextIdx = position; 1784 } 1785 return true; 1786 }; 1787 1788 /** 1789 * Set the cache position to the specified position, or, if the position 1790 * falls between to cached boundaries, to the preceding boundary. 1791 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1792 * The startPosition must be on a code point boundary. 1793 * 1794 * Return true if successful, false if the specified position is after 1795 * the last cached boundary or before the first. 1796 */ 1797 boolean seek(int pos) { 1798 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1799 return false; 1800 } 1801 if (pos == fBoundaries[fStartBufIdx]) { 1802 // Common case: seek(0), from BreakIterator::first() 1803 fBufIdx = fStartBufIdx; 1804 fTextIdx = fBoundaries[fBufIdx]; 1805 return true; 1806 } 1807 if (pos == fBoundaries[fEndBufIdx]) { 1808 fBufIdx = fEndBufIdx; 1809 fTextIdx = fBoundaries[fBufIdx]; 1810 return true; 1811 } 1812 1813 int min = fStartBufIdx; 1814 int max = fEndBufIdx; 1815 while (min != max) { 1816 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1817 probe = modChunkSize(probe); 1818 if (fBoundaries[probe] > pos) { 1819 max = probe; 1820 } else { 1821 min = modChunkSize(probe + 1); 1822 } 1823 } 1824 assert(fBoundaries[max] > pos); 1825 fBufIdx = modChunkSize(max - 1); 1826 fTextIdx = fBoundaries[fBufIdx]; 1827 assert(fTextIdx <= pos); 1828 return true; 1829 1830 }; 1831 1832 1833 /** 1834 * copy constructor, used from RuleBasedBreakIterator.clone(). 1835 * 1836 * @param src 1837 */ BreakCache(BreakCache src)1838 BreakCache(BreakCache src) { 1839 fStartBufIdx = src.fStartBufIdx; 1840 fEndBufIdx = src.fEndBufIdx; 1841 fTextIdx = src.fTextIdx; 1842 fBufIdx = src.fBufIdx; 1843 fBoundaries = src.fBoundaries.clone(); 1844 fStatuses = src.fStatuses.clone(); 1845 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1846 } 1847 dumpCache()1848 void dumpCache() { 1849 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1850 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1851 System.out.printf("%d %d%n", i, fBoundaries[i]); 1852 if (i == fEndBufIdx) { 1853 break; 1854 } 1855 } 1856 }; 1857 modChunkSize(int index)1858 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1859 1860 static final int CACHE_SIZE = 128; 1861 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1862 1863 int fStartBufIdx; 1864 int fEndBufIdx; // inclusive 1865 1866 int fTextIdx; 1867 int fBufIdx; 1868 1869 int[] fBoundaries = new int[CACHE_SIZE]; 1870 short[] fStatuses = new short[CACHE_SIZE]; 1871 1872 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1873 }; 1874 1875 1876 1877 1878 } 1879 1880