1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2005-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import static com.ibm.icu.impl.CharacterIteration.DONE32; 13 import static com.ibm.icu.impl.CharacterIteration.next32; 14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32; 15 16 import java.io.ByteArrayOutputStream; 17 import java.io.IOException; 18 import java.io.InputStream; 19 import java.io.OutputStream; 20 import java.nio.ByteBuffer; 21 import java.text.CharacterIterator; 22 import java.util.ArrayList; 23 import java.util.List; 24 25 import com.ibm.icu.impl.CharacterIteration; 26 import com.ibm.icu.impl.ICUBinary; 27 import com.ibm.icu.impl.ICUDebug; 28 import com.ibm.icu.impl.RBBIDataWrapper; 29 import com.ibm.icu.lang.UCharacter; 30 import com.ibm.icu.lang.UProperty; 31 import com.ibm.icu.lang.UScript; 32 import com.ibm.icu.util.CodePointTrie; 33 34 /** 35 * Rule Based Break Iterator 36 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 37 * 38 * @stable ICU 2.0 39 */ 40 public class RuleBasedBreakIterator extends BreakIterator { 41 //======================================================================= 42 // Constructors & Factories 43 //======================================================================= 44 45 /** 46 * private constructor 47 */ RuleBasedBreakIterator()48 private RuleBasedBreakIterator() { 49 fDictionaryCharCount = 0; 50 synchronized(gAllBreakEngines) { 51 fBreakEngines = new ArrayList<>(gAllBreakEngines); 52 } 53 } 54 55 /** 56 * Create a break iterator from a precompiled set of break rules. 57 * 58 * Creating a break iterator from the binary rules is much faster than 59 * creating one from source rules. 60 * 61 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 62 * Binary break iterator rules are not guaranteed to be compatible between 63 * different versions of ICU. 64 * 65 * @param is an input stream supplying the compiled binary rules. 66 * @throws IOException if there is an error while reading the rules from the InputStream. 67 * @see #compileRules(String, OutputStream) 68 * @stable ICU 4.8 69 */ getInstanceFromCompiledRules(InputStream is)70 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 71 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 72 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 73 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 74 return This; 75 } 76 77 /** 78 * Create a break iterator from a precompiled set of break rules. 79 * 80 * Creating a break iterator from the binary rules is much faster than 81 * creating one from source rules. 82 * 83 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 84 * Binary break iterator rules are not guaranteed to be compatible between 85 * different versions of ICU. 86 * 87 * @param bytes a buffer supplying the compiled binary rules. 88 * @throws IOException if there is an error while reading the rules from the buffer. 89 * @see #compileRules(String, OutputStream) 90 * @internal 91 * @deprecated This API is ICU internal only. 92 */ 93 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)94 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 95 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 96 This.fRData = RBBIDataWrapper.get(bytes); 97 This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; 98 return This; 99 } 100 101 /** 102 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 103 * @param rules The break rules to be used. 104 * @stable ICU 2.2 105 */ RuleBasedBreakIterator(String rules)106 public RuleBasedBreakIterator(String rules) { 107 this(); 108 try { 109 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 110 compileRules(rules, ruleOS); 111 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 112 fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 113 } catch (IOException e) { 114 ///CLOVER:OFF 115 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 116 // causing bogus compiled rules to be produced, but with no compile error raised. 117 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 118 + e.getMessage()); 119 throw rte; 120 ///CLOVER:ON 121 } 122 } 123 124 //======================================================================= 125 // Boilerplate 126 //======================================================================= 127 128 /** 129 * Clones this iterator. 130 * @return A newly-constructed RuleBasedBreakIterator with the same 131 * behavior as this one. 132 * @stable ICU 2.0 133 */ 134 @Override clone()135 public Object clone() { 136 RuleBasedBreakIterator result; 137 result = (RuleBasedBreakIterator)super.clone(); 138 if (fText != null) { 139 result.fText = (CharacterIterator)(fText.clone()); 140 } 141 synchronized (gAllBreakEngines) { 142 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 143 } 144 result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; 145 result.fBreakCache = result.new BreakCache(fBreakCache); 146 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 147 return result; 148 } 149 150 151 /** 152 * Returns true if both BreakIterators are of the same class, have the same 153 * rules, and iterate over the same text. 154 * @stable ICU 2.0 155 */ 156 @Override equals(Object that)157 public boolean equals(Object that) { 158 if (that == null) { 159 return false; 160 } 161 if (this == that) { 162 return true; 163 } 164 try { 165 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 166 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 167 return false; 168 } 169 if (fRData != null && other.fRData != null && 170 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 171 return false; 172 } 173 if (fText == null && other.fText == null) { 174 return true; 175 } 176 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 177 return false; 178 } 179 return fPosition == other.fPosition; 180 } 181 catch(ClassCastException e) { 182 return false; 183 } 184 } 185 186 /** 187 * Returns the description (rules) used to create this iterator. 188 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 189 * @stable ICU 2.0 190 */ 191 @Override toString()192 public String toString() { 193 String retStr = ""; 194 if (fRData != null) { 195 retStr = fRData.fRuleSource; 196 } 197 return retStr; 198 } 199 200 /** 201 * Compute a hashcode for this BreakIterator 202 * @return A hash code 203 * @stable ICU 2.0 204 */ 205 @Override hashCode()206 public int hashCode() 207 { 208 return fRData.fRuleSource.hashCode(); 209 } 210 211 212 private static final int START_STATE = 1; // The state number of the starting state 213 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 214 215 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 216 // of user text. A variable with this enum type keeps track of where we 217 // are. The state machine only fetches user text input while in RUN mode. 218 private static final int RBBI_START = 0; 219 private static final int RBBI_RUN = 1; 220 private static final int RBBI_END = 2; 221 222 /** 223 * The character iterator through which this BreakIterator accesses the text. 224 */ 225 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 226 227 /** 228 * The rule data for this BreakIterator instance. 229 * Not intended for public use. Declared public for testing purposes only. 230 * @internal 231 * @deprecated This API is ICU internal only. 232 */ 233 @Deprecated 234 public RBBIDataWrapper fRData; 235 236 /** 237 * The iteration state - current position, rule status for the current position, 238 * and whether the iterator ran off the end, yielding UBRK_DONE. 239 * Current position is pinned to be 0 < position <= text.length. 240 * Current position is always set to a boundary. 241 * 242 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 243 * Never has the value UBRK_DONE (-1). 244 */ 245 private int fPosition; 246 247 /** 248 * Index of the Rule {tag} values for the most recent match. 249 */ 250 private int fRuleStatusIndex; 251 252 /** 253 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 254 */ 255 private boolean fDone; 256 257 /** 258 * Array of look-ahead tentative results. 259 */ 260 private int[] fLookAheadMatches; 261 262 /** 263 * Cache of previously determined boundary positions. 264 */ 265 private BreakCache fBreakCache = new BreakCache(); 266 267 268 /** 269 * Counter for the number of characters encountered with the "dictionary" 270 * flag set. Normal RBBI iterators don't use it, although the code 271 * for updating it is live. Dictionary Based break iterators (a subclass 272 * of us) access this field directly. 273 * @internal 274 */ 275 private int fDictionaryCharCount; 276 277 private DictionaryCache fDictionaryCache = new DictionaryCache(); 278 279 /** 280 * ICU debug argument name for RBBI 281 */ 282 private static final String RBBI_DEBUG_ARG = "rbbi"; 283 284 /** 285 * Debugging flag. Trace operation of state machine when true. 286 */ 287 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 288 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 289 290 /** 291 * The "default" break engine - just skips over ranges of dictionary words, 292 * producing no breaks. Should only be used if characters need to be handled 293 * by a dictionary but we have no dictionary implementation for them. 294 * 295 * Only one instance; shared by all break iterators. 296 */ 297 private static final UnhandledBreakEngine gUnhandledBreakEngine; 298 299 /** 300 * List of all known break engines, common for all break iterators. 301 * Lazily updated as break engines are needed, because instantiation of 302 * break engines is expensive. 303 * 304 * Because gAllBreakEngines can be referenced concurrently from different 305 * BreakIterator instances, all access is synchronized. 306 */ 307 private static final List<LanguageBreakEngine> gAllBreakEngines; 308 309 static { 310 gUnhandledBreakEngine = new UnhandledBreakEngine(); 311 gAllBreakEngines = new ArrayList<>(); 312 gAllBreakEngines.add(gUnhandledBreakEngine); 313 } 314 315 /** 316 * List of all known break engines. Similar to gAllBreakEngines, but local to a 317 * break iterator, allowing it to be used without synchronization. 318 */ 319 private List<LanguageBreakEngine> fBreakEngines; 320 321 /** 322 * Dump the contents of the state table and character classes for this break iterator. 323 * For debugging only. 324 * @internal 325 * @deprecated This API is ICU internal only. 326 */ 327 @Deprecated dump(java.io.PrintStream out)328 public void dump(java.io.PrintStream out) { 329 if (out == null) { 330 out = System.out; 331 } 332 this.fRData.dump(out); 333 } 334 335 /** 336 * Compile a set of source break rules into the binary state tables used 337 * by the break iterator engine. Creating a break iterator from precompiled 338 * rules is much faster than creating one from source rules. 339 * 340 * Binary break rules are not guaranteed to be compatible between different 341 * versions of ICU. 342 * 343 * 344 * @param rules The source form of the break rules 345 * @param ruleBinary An output stream to receive the compiled rules. 346 * @throws IOException If there is an error writing the output. 347 * @see #getInstanceFromCompiledRules(InputStream) 348 * @stable ICU 4.8 349 */ compileRules(String rules, OutputStream ruleBinary)350 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 351 RBBIRuleBuilder.compileRules(rules, ruleBinary); 352 } 353 354 //======================================================================= 355 // BreakIterator overrides 356 //======================================================================= 357 358 /** 359 * Sets the current iteration position to the beginning of the text. 360 * (i.e., the CharacterIterator's starting offset). 361 * @return The offset of the beginning of the text. 362 * @stable ICU 2.0 363 */ 364 @Override first()365 public int first() { 366 if (fText == null) { 367 return BreakIterator.DONE; 368 } 369 fText.first(); 370 int start = fText.getIndex(); 371 if (!fBreakCache.seek(start)) { 372 fBreakCache.populateNear(start); 373 } 374 fBreakCache.current(); 375 assert(fPosition == start); 376 return fPosition; 377 } 378 379 /** 380 * Sets the current iteration position to the end of the text. 381 * (i.e., the CharacterIterator's ending offset). 382 * @return The text's past-the-end offset. 383 * @stable ICU 2.0 384 */ 385 @Override last()386 public int last() { 387 if (fText == null) { 388 return BreakIterator.DONE; 389 } 390 int endPos = fText.getEndIndex(); 391 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 392 assert(endShouldBeBoundary); 393 if (fPosition != endPos) { 394 assert(fPosition == endPos); 395 } 396 return endPos; 397 } 398 399 /** 400 * Advances the iterator either forward or backward the specified number of steps. 401 * Negative values move backward, and positive values move forward. This is 402 * equivalent to repeatedly calling next() or previous(). 403 * @param n The number of steps to move. The sign indicates the direction 404 * (negative is backwards, and positive is forwards). 405 * @return The character offset of the boundary position n boundaries away from 406 * the current one. 407 * @stable ICU 2.0 408 */ 409 @Override next(int n)410 public int next(int n) { 411 int result = 0; 412 if (n > 0) { 413 for (; n > 0 && result != DONE; --n) { 414 result = next(); 415 } 416 } else if (n < 0) { 417 for (; n < 0 && result != DONE; ++n) { 418 result = previous(); 419 } 420 } else { 421 result = current(); 422 } 423 return result; 424 } 425 426 /** 427 * Advances the iterator to the next boundary position. 428 * @return The position of the first boundary after this one. 429 * @stable ICU 2.0 430 */ 431 @Override next()432 public int next() { 433 fBreakCache.next(); 434 return fDone ? DONE : fPosition; 435 } 436 437 /** 438 * Moves the iterator backwards, to the boundary preceding the current one. 439 * @return The position of the boundary position immediately preceding the starting position. 440 * @stable ICU 2.0 441 */ 442 @Override previous()443 public int previous() { 444 fBreakCache.previous(); 445 return fDone ? DONE : fPosition; 446 } 447 448 /** 449 * Sets the iterator to refer to the first boundary position following 450 * the specified position. 451 * @param startPos The position from which to begin searching for a break position. 452 * @return The position of the first break after the current position. 453 * @stable ICU 2.0 454 */ 455 @Override following(int startPos)456 public int following(int startPos) { 457 // if the supplied position is before the beginning, return the 458 // text's starting offset 459 if (startPos < fText.getBeginIndex()) { 460 return first(); 461 } 462 463 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 464 // Or it may be beyond the end of the text. 465 startPos = CISetIndex32(fText, startPos); 466 fBreakCache.following(startPos); 467 return fDone ? DONE : fPosition; 468 } 469 470 471 /** 472 * Sets the iterator to refer to the last boundary position before the 473 * specified position. 474 * @param offset The position to begin searching for a break from. 475 * @return The position of the last boundary before the starting position. 476 * @stable ICU 2.0 477 */ 478 @Override preceding(int offset)479 public int preceding(int offset) { 480 if (fText == null || offset > fText.getEndIndex()) { 481 return last(); 482 } else if (offset < fText.getBeginIndex()) { 483 return first(); 484 } 485 486 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 487 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 488 int adjustedOffset = offset; 489 fBreakCache.preceding(adjustedOffset); 490 return fDone ? DONE : fPosition; 491 492 } 493 494 495 /** 496 * Throw IllegalArgumentException unless begin <= offset < end. 497 * @stable ICU 2.0 498 */ checkOffset(int offset, CharacterIterator text)499 protected static final void checkOffset(int offset, CharacterIterator text) { 500 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 501 throw new IllegalArgumentException("offset out of bounds"); 502 } 503 } 504 505 506 /** 507 * Returns true if the specified position is a boundary position. As a side 508 * effect, leaves the iterator pointing to the first boundary position at 509 * or after "offset". 510 * @param offset the offset to check. 511 * @return True if "offset" is a boundary position. 512 * @stable ICU 2.0 513 */ 514 @Override isBoundary(int offset)515 public boolean isBoundary(int offset) { 516 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 517 // to not be boundaries, and to not be errors. 518 checkOffset(offset, fText); 519 520 // Adjust offset to be on a code point boundary and not beyond the end of the text. 521 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 522 // But we still need the side effect of leaving iteration at the following boundary. 523 int adjustedOffset = CISetIndex32(fText, offset); 524 525 boolean result = false; 526 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 527 result = (fBreakCache.current() == offset); 528 } 529 530 if (!result) { 531 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 532 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 533 next(); 534 } 535 return result; 536 537 } 538 539 /** 540 * Returns the current iteration position. Note that DONE is never 541 * returned from this function; if iteration has run to the end of a 542 * string, current() will return the length of the string while 543 * next() will return BreakIterator.DONE). 544 * @return The current iteration position. 545 * @stable ICU 2.0 546 */ 547 @Override current()548 public int current() { 549 return (fText != null) ? fPosition : BreakIterator.DONE; 550 } 551 552 553 /** 554 * Return the status tag from the break rule that determined the boundary at 555 * the current iteration position. The values appear in the rule source 556 * within brackets, {123}, for example. For rules that do not specify a 557 * status, a default value of 0 is returned. If more than one rule applies, 558 * the numerically largest of the possible status values is returned. 559 * <p> 560 * Of the standard types of ICU break iterators, only the word and line break 561 * iterator provides status values. The values are defined in 562 * class RuleBasedBreakIterator, and allow distinguishing between words 563 * that contain alphabetic letters, "words" that appear to be numbers, 564 * punctuation and spaces, words containing ideographic characters, and 565 * more. Call <code>getRuleStatus</code> after obtaining a boundary 566 * position from <code>next()</code>, <code>previous()</code>, or 567 * any other break iterator functions that returns a boundary position. 568 * <p> 569 * Note that <code>getRuleStatus()</code> returns the value corresponding to 570 * <code>current()</code> index even after <code>next()</code> has returned DONE. 571 * <p> 572 573 * @return the status from the break rule that determined the boundary 574 * at the current iteration position. 575 * 576 * @stable ICU 60 577 */ 578 579 @Override getRuleStatus()580 public int getRuleStatus() { 581 // Status records have this form: 582 // Count N <-- fLastRuleStatusIndex points here. 583 // Status val 0 584 // Status val 1 585 // ... 586 // Status val N-1 <-- the value we need to return 587 // The status values are sorted in ascending order. 588 // This function returns the last (largest) of the array of status values. 589 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 590 int tagVal = fRData.fStatusTable[idx]; 591 return tagVal; 592 } 593 594 /** 595 * Get the status (tag) values from the break rule(s) that determined the boundary 596 * at the current iteration position. The values appear in the rule source 597 * within brackets, {123}, for example. The default status value for rules 598 * that do not explicitly provide one is zero. 599 * <p> 600 * The status values used by the standard ICU break rules are defined 601 * as public constants in class RuleBasedBreakIterator. 602 * <p> 603 * If the size of the output array is insufficient to hold the data, 604 * the output will be truncated to the available length. No exception 605 * will be thrown. 606 * 607 * @param fillInArray an array to be filled in with the status values. 608 * @return The number of rule status values from the rules that determined 609 * the boundary at the current iteration position. 610 * In the event that the array is too small, the return value 611 * is the total number of status values that were available, 612 * not the reduced number that were actually returned. 613 * @stable ICU 60 614 */ 615 @Override getRuleStatusVec(int[] fillInArray)616 public int getRuleStatusVec(int[] fillInArray) { 617 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 618 if (fillInArray != null) { 619 int numToCopy = Math.min(numStatusVals, fillInArray.length); 620 for (int i=0; i<numToCopy; i++) { 621 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 622 } 623 } 624 return numStatusVals; 625 } 626 627 /** 628 * Returns a CharacterIterator over the text being analyzed. 629 * <p> 630 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 631 * must not be modified in any way while the BreakIterator is still in use. 632 * Doing so will lead to undefined behavior of the BreakIterator. 633 * Clone the returned CharacterIterator first and work with that. 634 * <p> 635 * The returned CharacterIterator is a reference 636 * to the <b>actual iterator being used</b> by the BreakIterator. 637 * No guarantees are made about the current position 638 * of this iterator when it is returned; it may differ from the 639 * BreakIterators current position. If you need to move that 640 * position to examine the text, clone this function's return value first. 641 * @return An iterator over the text being analyzed. 642 * @stable ICU 2.0 643 */ 644 @Override getText()645 public CharacterIterator getText() { 646 return fText; 647 } 648 649 /** 650 * Set the iterator to analyze a new piece of text. This function resets 651 * the current iteration position to the beginning of the text. 652 * (The old iterator is dropped.) 653 * <p> 654 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 655 * directly by the BreakIterator, and must not be altered in any 656 * way by code outside of the BreakIterator. 657 * Doing so will lead to undefined behavior of the BreakIterator. 658 * 659 * @param newText An iterator over the text to analyze. 660 * @stable ICU 2.0 661 */ 662 @Override setText(CharacterIterator newText)663 public void setText(CharacterIterator newText) { 664 if (newText != null) { 665 fBreakCache.reset(newText.getBeginIndex(), 0); 666 } else { 667 fBreakCache.reset(); 668 } 669 fDictionaryCache.reset(); 670 fText = newText; 671 this.first(); 672 } 673 674 /** 675 * Control debug, trace and dump options. 676 * @internal 677 * @deprecated This API is ICU internal only. 678 */ 679 @Deprecated 680 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 681 ICUDebug.value(RBBI_DEBUG_ARG) : null; 682 683 getLanguageBreakEngine(int c)684 private LanguageBreakEngine getLanguageBreakEngine(int c) { 685 686 // We have a dictionary character. 687 // Does an already instantiated break engine handle it? 688 for (LanguageBreakEngine candidate : fBreakEngines) { 689 if (candidate.handles(c)) { 690 return candidate; 691 } 692 } 693 694 synchronized (gAllBreakEngines) { 695 // This break iterator's list of break engines didn't handle the character. 696 // Check the global list, another break iterator may have instantiated the 697 // desired engine. 698 for (LanguageBreakEngine candidate : gAllBreakEngines) { 699 if (candidate.handles(c)) { 700 fBreakEngines.add(candidate); 701 return candidate; 702 } 703 } 704 705 // The global list doesn't have an existing engine, build one. 706 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 707 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 708 // Katakana, Hiragana and Han are handled by the same dictionary engine. 709 // Fold them together for mapping from script -> engine. 710 script = UScript.HAN; 711 } 712 713 LanguageBreakEngine eng; 714 try { 715 switch (script) { 716 case UScript.THAI: 717 eng = new ThaiBreakEngine(); 718 break; 719 case UScript.LAO: 720 eng = new LaoBreakEngine(); 721 break; 722 case UScript.MYANMAR: 723 eng = new BurmeseBreakEngine(); 724 break; 725 case UScript.KHMER: 726 eng = new KhmerBreakEngine(); 727 break; 728 case UScript.HAN: 729 eng = new CjkBreakEngine(false); 730 break; 731 case UScript.HANGUL: 732 eng = new CjkBreakEngine(true); 733 break; 734 default: 735 gUnhandledBreakEngine.handleChar(c); 736 eng = gUnhandledBreakEngine; 737 break; 738 } 739 } catch (IOException e) { 740 eng = null; 741 } 742 743 if (eng != null && eng != gUnhandledBreakEngine) { 744 gAllBreakEngines.add(eng); 745 fBreakEngines.add(eng); 746 } 747 return eng; 748 } // end synchronized(gAllBreakEngines) 749 } 750 751 /** 752 * The State Machine Engine for moving forward is here. 753 * This function is the heart of the RBBI run time engine. 754 * 755 * Input 756 * fPosition, the position in the text to begin from. 757 * Output 758 * fPosition: the boundary following the starting position. 759 * fDictionaryCharCount the number of dictionary characters encountered. 760 * If > 0, the segment will be further subdivided 761 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 762 * 763 * @return the new iterator position 764 * 765 * A note on supplementary characters and the position of underlying 766 * Java CharacterIterator: Normally, a character iterator is positioned at 767 * the char most recently returned by next(). Within this function, when 768 * a supplementary char is being processed, the char iterator is left 769 * sitting on the trail surrogate, in the middle of the code point. 770 * This is different from everywhere else, where an iterator always 771 * points at the lead surrogate of a supplementary. 772 */ handleNext()773 private int handleNext() { 774 if (TRACE) { 775 System.out.println("Handle Next pos char state category"); 776 } 777 778 // handleNext always sets the break tag value. 779 // Set the default for it. 780 fRuleStatusIndex = 0; 781 fDictionaryCharCount = 0; 782 783 // caches for quicker access 784 CharacterIterator text = fText; 785 CodePointTrie trie = fRData.fTrie; 786 787 char[] stateTable = fRData.fFTable.fTable; 788 int initialPosition = fPosition; 789 text.setIndex(initialPosition); 790 int result = initialPosition; 791 792 // Set up the starting char 793 int c = text.current(); 794 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 795 c = nextTrail32(text, c); 796 if (c == DONE32) { 797 fDone = true; 798 return BreakIterator.DONE; 799 } 800 } 801 802 // Set the initial state for the state machine 803 int state = START_STATE; 804 int row = fRData.getRowIndex(state); 805 short category = 3; 806 int flagsState = fRData.fFTable.fFlags; 807 int dictStart = fRData.fFTable.fDictCategoriesStart; 808 int mode = RBBI_RUN; 809 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 810 category = 2; 811 mode = RBBI_START; 812 if (TRACE) { 813 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 814 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 815 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 816 } 817 } 818 819 // loop until we reach the end of the text or transition to state 0 820 while (state != STOP_STATE) { 821 if (c == DONE32) { 822 // Reached end of input string. 823 if (mode == RBBI_END) { 824 // We have already run the loop one last time with the 825 // character set to the pseudo {eof} value. Now it is time 826 // to unconditionally bail out. 827 break; 828 } 829 // Run the loop one last time with the fake end-of-input character category 830 mode = RBBI_END; 831 category = 1; 832 } 833 else if (mode == RBBI_RUN) { 834 // Get the char category. An incoming category of 1 or 2 mens that 835 // we are preset for doing the beginning or end of input, and 836 // that we shouldn't get a category from an actual text input character. 837 // 838 839 // look up the current character's character category, which tells us 840 // which column in the state table to look at. 841 // 842 category = (short) trie.get(c); 843 844 // Check for categories that require word dictionary handling. 845 if (category >= dictStart) { 846 fDictionaryCharCount++; 847 } 848 849 if (TRACE) { 850 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 851 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 852 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 853 } 854 855 // Advance to the next character. 856 // If this is a beginning-of-input loop iteration, don't advance. 857 // The next iteration will be processing the first real input character. 858 c = text.next(); 859 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 860 c = nextTrail32(text, c); 861 } 862 } 863 else { 864 mode = RBBI_RUN; 865 } 866 867 // look up a state transition in the state table 868 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 869 row = fRData.getRowIndex(state); 870 int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING]; 871 if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 872 // Match found, common case 873 result = text.getIndex(); 874 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 875 // The iterator has been left in the middle of a surrogate pair. 876 // We want the start of it. 877 result--; 878 } 879 880 // Remember the break status (tag) values. 881 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 882 } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { 883 // Lookahead match is completed 884 int lookaheadResult = fLookAheadMatches[accepting]; 885 if (lookaheadResult >= 0) { 886 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; 887 fPosition = lookaheadResult; 888 return lookaheadResult; 889 } 890 } 891 892 893 // If we are at the position of the '/' in a look-ahead (hard break) rule; 894 // record the current position, to be returned later, if the full rule matches. 895 // TODO: Move this check before the previous check of fAccepting. 896 // This would enable hard-break rules with no following context. 897 // But there are line break test failures when trying this. Investigate. 898 // Issue ICU-20837 899 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 900 if (rule != 0) { 901 int pos = text.getIndex(); 902 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 903 // The iterator has been left in the middle of a surrogate pair. 904 // We want the beginning of it. 905 pos--; 906 } 907 fLookAheadMatches[rule] = pos; 908 } 909 910 911 } // End of state machine main loop 912 913 // The state machine is done. Check whether it found a match... 914 915 // If the iterator failed to advance in the match engine force it ahead by one. 916 // This indicates a defect in the break rules, which should always match 917 // at least one character. 918 919 if (result == initialPosition) { 920 if (TRACE) { 921 System.out.println("Iterator did not move. Advancing by 1."); 922 } 923 text.setIndex(initialPosition); 924 next32(text); 925 result = text.getIndex(); 926 fRuleStatusIndex = 0; 927 } 928 929 // Leave the iterator at our result position. 930 // (we may have advanced beyond the last accepting position chasing after 931 // longer matches that never completed.) 932 fPosition = result; 933 934 if (TRACE) { 935 System.out.println("result = " + result); 936 } 937 return result; 938 } 939 940 /** 941 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 942 * This locates a "Safe Position" from which the forward break rules 943 * will operate correctly. A Safe Position is not necessarily a boundary itself. 944 * 945 * The logic of this function is very similar to handleNext(), above, but simpler 946 * because the safe table does not require as many options. 947 * 948 * @param fromPosition the position in the input text to begin the iteration. 949 * @internal 950 */ handleSafePrevious(int fromPosition)951 private int handleSafePrevious(int fromPosition) { 952 char state; 953 short category = 0; 954 int result = 0; 955 956 // caches for quicker access 957 CharacterIterator text = fText; 958 CodePointTrie trie = fRData.fTrie; 959 char[] stateTable = fRData.fRTable.fTable; 960 961 CISetIndex32(text, fromPosition); 962 if (TRACE) { 963 System.out.print("Handle Previous pos char state category"); 964 } 965 966 // if we're already at the start of the text, return DONE. 967 if (text.getIndex() == text.getBeginIndex()) { 968 return BreakIterator.DONE; 969 } 970 971 // Set the initial state for the state machine 972 int c = CharacterIteration.previous32(text); 973 state = START_STATE; 974 int row = fRData.getRowIndex(state); 975 976 // loop until we reach the start of the text or transition to state 0 977 // 978 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 979 980 // look up the current character's character category, which tells us 981 // which column in the state table to look at. 982 // 983 // And off the dictionary flag bit. For reverse iteration it is not used. 984 category = (short) trie.get(c); 985 if (TRACE) { 986 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 987 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 988 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 989 } 990 991 // State Transition - move machine to its next state 992 // 993 assert(category < fRData.fHeader.fCatCount); 994 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 995 row = fRData.getRowIndex(state); 996 997 if (state == STOP_STATE) { 998 // This is the normal exit from the lookup state machine. 999 // Transition to state zero means we have found a safe point. 1000 break; 1001 } 1002 } 1003 1004 // The state machine is done. 1005 result = text.getIndex(); 1006 if (TRACE) { 1007 System.out.println("result = " + result); 1008 } 1009 return result; 1010 } 1011 1012 /** 1013 * Set the index of a CharacterIterator. 1014 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1015 * If the index points to a trail surrogate of a supplementary character, adjust it 1016 * to the start (lead surrogate) index. 1017 * 1018 * @param ci A CharacterIterator to set 1019 * @param index the index to set 1020 * @return the resulting index, possibly pinned or adjusted. 1021 */ 1022 private static int CISetIndex32(CharacterIterator ci, int index) { 1023 if (index <= ci.getBeginIndex()) { 1024 ci.first(); 1025 } else if (index >= ci.getEndIndex()) { 1026 ci.setIndex(ci.getEndIndex()); 1027 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1028 if (!Character.isHighSurrogate(ci.previous())) { 1029 ci.next(); 1030 } 1031 } 1032 return ci.getIndex(); 1033 } 1034 1035 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1036 * Dictionary boundaries are moved first to this cache, then from here 1037 * to the main BreakCache, where they may inter-leave with non-dictionary 1038 * boundaries. The public BreakIterator API always fetches directly 1039 * from the main BreakCache, not from here. 1040 * 1041 * In common situations, the number of boundaries in a single dictionary run 1042 * should be quite small, it will be terminated by punctuation, spaces, 1043 * or any other non-dictionary characters. The main BreakCache may end 1044 * up with boundaries from multiple dictionary based runs. 1045 * 1046 * The boundaries are stored in a simple ArrayList (vector), with the 1047 * assumption that they will be accessed sequentially. 1048 */ 1049 class DictionaryCache { 1050 1051 void reset() { 1052 fPositionInCache = -1; 1053 fStart = 0; 1054 fLimit = 0; 1055 fFirstRuleStatusIndex = 0; 1056 fOtherRuleStatusIndex = 0; 1057 fBreaks.removeAllElements(); 1058 }; 1059 1060 boolean following(int fromPos) { 1061 if (fromPos >= fLimit || fromPos < fStart) { 1062 fPositionInCache = -1; 1063 return false; 1064 } 1065 1066 // Sequential iteration, move from previous boundary to the following 1067 1068 int r = 0; 1069 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1070 ++fPositionInCache; 1071 if (fPositionInCache >= fBreaks.size()) { 1072 fPositionInCache = -1; 1073 return false; 1074 } 1075 r = fBreaks.elementAt(fPositionInCache); 1076 assert(r > fromPos); 1077 fBoundary = r; 1078 fStatusIndex = fOtherRuleStatusIndex; 1079 return true; 1080 } 1081 1082 // Random indexing. Linear search for the boundary following the given position. 1083 1084 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1085 r= fBreaks.elementAt(fPositionInCache); 1086 if (r > fromPos) { 1087 fBoundary = r; 1088 fStatusIndex = fOtherRuleStatusIndex; 1089 return true; 1090 } 1091 } 1092 1093 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1094 assert(false); 1095 fPositionInCache = -1; 1096 return false; 1097 }; 1098 preceding(int fromPos)1099 boolean preceding(int fromPos) { 1100 if (fromPos <= fStart || fromPos > fLimit) { 1101 fPositionInCache = -1; 1102 return false; 1103 } 1104 1105 if (fromPos == fLimit) { 1106 fPositionInCache = fBreaks.size() - 1; 1107 if (fPositionInCache >= 0) { 1108 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1109 } 1110 } 1111 1112 int r; 1113 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1114 --fPositionInCache; 1115 r = fBreaks.elementAt(fPositionInCache); 1116 assert(r < fromPos); 1117 fBoundary = r; 1118 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1119 return true; 1120 } 1121 1122 if (fPositionInCache == 0) { 1123 fPositionInCache = -1; 1124 return false; 1125 } 1126 1127 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1128 r = fBreaks.elementAt(fPositionInCache); 1129 if (r < fromPos) { 1130 fBoundary = r; 1131 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1132 return true; 1133 } 1134 } 1135 assert(false); 1136 fPositionInCache = -1; 1137 return false; 1138 }; 1139 1140 /** 1141 * Populate the cache with the dictionary based boundaries within a region of text. 1142 * @param startPos The start position of a range of text 1143 * @param endPos The end position of a range of text 1144 * @param firstRuleStatus The rule status index that applies to the break at startPos 1145 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1146 * @internal 1147 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1148 void populateDictionary(int startPos, int endPos, 1149 int firstRuleStatus, int otherRuleStatus) { 1150 if ((endPos - startPos) <= 1) { 1151 return; 1152 } 1153 1154 reset(); 1155 fFirstRuleStatusIndex = firstRuleStatus; 1156 fOtherRuleStatusIndex = otherRuleStatus; 1157 1158 int rangeStart = startPos; 1159 int rangeEnd = endPos; 1160 1161 int category; 1162 int current; 1163 int foundBreakCount = 0; 1164 1165 // Loop through the text, looking for ranges of dictionary characters. 1166 // For each span, find the appropriate break engine, and ask it to find 1167 // any breaks within the span. 1168 1169 fText.setIndex(rangeStart); 1170 int c = CharacterIteration.current32(fText); 1171 category = (short)fRData.fTrie.get(c); 1172 int dictStart = fRData.fFTable.fDictCategoriesStart; 1173 1174 while(true) { 1175 while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) { 1176 c = CharacterIteration.next32(fText); // pre-increment 1177 category = (short)fRData.fTrie.get(c); 1178 } 1179 if (current >= rangeEnd) { 1180 break; 1181 } 1182 1183 // We now have a dictionary character. Get the appropriate language object 1184 // to deal with it. 1185 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1186 1187 // Ask the language object if there are any breaks. It will add them to the cache and 1188 // leave the text pointer on the other side of its range, ready to search for the next one. 1189 if (lbe != null) { 1190 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); 1191 } 1192 1193 // Reload the loop variables for the next go-round 1194 c = CharacterIteration.current32(fText); 1195 category = (short)fRData.fTrie.get(c); 1196 } 1197 1198 // If we found breaks, ensure that the first and last entries are 1199 // the original starting and ending position. And initialize the 1200 // cache iteration position to the first entry. 1201 1202 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1203 if (foundBreakCount > 0) { 1204 assert(foundBreakCount == fBreaks.size()); 1205 if (startPos < fBreaks.elementAt(0)) { 1206 // The dictionary did not place a boundary at the start of the segment of text. 1207 // Add one now. This should not commonly happen, but it would be easy for interactions 1208 // of the rules for dictionary segments and the break engine implementations to 1209 // inadvertently cause it. Cover it here, just in case. 1210 fBreaks.offer(startPos); 1211 } 1212 if (endPos > fBreaks.peek()) { 1213 fBreaks.push(endPos); 1214 } 1215 fPositionInCache = 0; 1216 // Note: Dictionary matching may extend beyond the original limit. 1217 fStart = fBreaks.elementAt(0); 1218 fLimit = fBreaks.peek(); 1219 } else { 1220 // there were no language-based breaks, even though the segment contained 1221 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1222 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1223 } 1224 1225 }; 1226 1227 DictionaryCache()1228 DictionaryCache() { 1229 fPositionInCache = -1; 1230 fBreaks = new DictionaryBreakEngine.DequeI(); 1231 } 1232 1233 /** 1234 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1235 * 1236 * @param src the source object to be copied. 1237 */ DictionaryCache(DictionaryCache src)1238 DictionaryCache(DictionaryCache src) { 1239 try { 1240 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1241 } 1242 catch (CloneNotSupportedException e) { 1243 throw new RuntimeException(e); 1244 } 1245 fPositionInCache = src.fPositionInCache; 1246 fStart = src.fStart; 1247 fLimit = src.fLimit; 1248 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1249 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1250 fBoundary = src.fBoundary; 1251 fStatusIndex = src.fStatusIndex; 1252 } 1253 1254 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1255 DictionaryBreakEngine.DequeI fBreaks; 1256 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1257 // // or preceding(). Optimizes sequential access. 1258 int fStart; // Text position of first boundary in cache. 1259 int fLimit; // Last boundary in cache. Which is the limit of the 1260 // // text segment being handled by the dictionary. 1261 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1262 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1263 int fBoundary; // Current boundary. Set by preceding(), following(). 1264 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1265 }; 1266 1267 1268 1269 1270 /* 1271 * class BreakCache 1272 * 1273 * Cache of break boundary positions and rule status values. 1274 * Break iterator API functions, next(), previous(), etc., will use cached results 1275 * when possible, and otherwise cache new results as they are obtained. 1276 * 1277 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1278 * 1279 * The cache is implemented as a single circular buffer. 1280 */ 1281 1282 /* 1283 * size of the circular cache buffer. 1284 */ 1285 1286 class BreakCache { 1287 BreakCache()1288 BreakCache() { 1289 reset(); 1290 }; 1291 reset(int pos, int ruleStatus)1292 void reset(int pos, int ruleStatus) { 1293 fStartBufIdx = 0; 1294 fEndBufIdx = 0; 1295 fTextIdx = pos; 1296 fBufIdx = 0; 1297 fBoundaries[0] = pos; 1298 fStatuses[0] = (short)ruleStatus; 1299 } 1300 reset()1301 void reset() {reset(0, 0); }; 1302 next()1303 void next() { 1304 if (fBufIdx == fEndBufIdx) { 1305 fDone = !populateFollowing(); 1306 fPosition = fTextIdx; 1307 fRuleStatusIndex = fStatuses[fBufIdx]; 1308 } else { 1309 fBufIdx = modChunkSize(fBufIdx + 1); 1310 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1311 fRuleStatusIndex = fStatuses[fBufIdx]; 1312 } 1313 }; 1314 previous()1315 void previous() { 1316 int initialBufIdx = fBufIdx; 1317 if (fBufIdx == fStartBufIdx) { 1318 // At start of cache. Prepend to it. 1319 populatePreceding(); 1320 } else { 1321 // Cache already holds the next boundary 1322 fBufIdx = modChunkSize(fBufIdx - 1); 1323 fTextIdx = fBoundaries[fBufIdx]; 1324 } 1325 fDone = (fBufIdx == initialBufIdx); 1326 fPosition = fTextIdx; 1327 fRuleStatusIndex = fStatuses[fBufIdx]; 1328 return; 1329 }; 1330 1331 // Move the iteration state to the position following the startPosition. 1332 // Input position must be pinned to the input length. following(int startPos)1333 void following(int startPos) { 1334 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1335 // startPos is in the cache. Do a next() from that position. 1336 // TODO: an awkward set of interactions with bi->fDone 1337 // seek() does not clear it; it can't because of interactions with populateNear(). 1338 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1339 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1340 fDone = false; 1341 next(); 1342 } 1343 1344 }; 1345 preceding(int startPos)1346 void preceding(int startPos) { 1347 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1348 if (startPos == fTextIdx) { 1349 previous(); 1350 } else { 1351 // seek() leaves the BreakCache positioned at the preceding boundary 1352 // if the requested position is between two boundaries. 1353 // current() pushes the BreakCache position out to the BreakIterator itself. 1354 assert(startPos > fTextIdx); 1355 current(); 1356 } 1357 } 1358 return; 1359 }; 1360 1361 /** 1362 * Update the state of the public BreakIterator (fBI) to reflect the 1363 * current state of the break iterator cache (this). 1364 */ current()1365 int current() { 1366 fPosition = fTextIdx; 1367 fRuleStatusIndex = fStatuses[fBufIdx]; 1368 fDone = false; 1369 return fTextIdx; 1370 }; 1371 1372 /** 1373 * Add boundaries to the cache near the specified position. 1374 * The given position need not be a boundary itself. 1375 * The input position must be within the range of the text, and 1376 * on a code point boundary. 1377 * If the requested position is a break boundary, leave the iteration 1378 * position on it. 1379 * If the requested position is not a boundary, leave the iteration 1380 * position on the preceding boundary and include both the the 1381 * preceding and following boundaries in the cache. 1382 * Additional boundaries, either preceding or following, may be added 1383 * to the cache as a side effect. 1384 * 1385 * Return false if the operation failed. 1386 */ populateNear(int position)1387 boolean populateNear(int position) { 1388 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1389 1390 // Find a boundary somewhere in the vicinity of the requested position. 1391 // Depending on the safe rules and the text data, it could be either before, at, or after 1392 // the requested position. 1393 1394 1395 // If the requested position is not near already cached positions, clear the existing cache, 1396 // find a near-by boundary and begin new cache contents there. 1397 1398 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1399 int aBoundary = fText.getBeginIndex(); 1400 int ruleStatusIndex = 0; 1401 if (position > aBoundary + 20) { 1402 int backupPos = handleSafePrevious(position); 1403 if (backupPos > aBoundary) { 1404 // Advance to the boundary following the backup position. 1405 // There is a complication: the safe reverse rules identify pairs of code points 1406 // that are safe. If advancing from the safe point moves forwards by less than 1407 // two code points, we need to advance one more time to ensure that the boundary 1408 // is good, including a correct rules status value. 1409 // 1410 fPosition = backupPos; 1411 aBoundary = handleNext(); 1412 if (aBoundary == backupPos + 1 || 1413 (aBoundary == backupPos + 2 && 1414 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1415 Character.isLowSurrogate(fText.next()))) { 1416 // The initial handleNext() only advanced by a single code point. Go again. 1417 // Safe rules identify safe pairs. 1418 aBoundary = handleNext(); 1419 } 1420 } 1421 ruleStatusIndex = fRuleStatusIndex; 1422 } 1423 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1424 } 1425 1426 // Fill in boundaries between existing cache content and the new requested position. 1427 1428 if (fBoundaries[fEndBufIdx] < position) { 1429 // The last position in the cache precedes the requested position. 1430 // Add following position(s) to the cache. 1431 while (fBoundaries[fEndBufIdx] < position) { 1432 if (!populateFollowing()) { 1433 assert false; 1434 return false; 1435 } 1436 } 1437 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1438 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1439 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1440 previous(); 1441 } 1442 return true; 1443 } 1444 1445 if (fBoundaries[fStartBufIdx] > position) { 1446 // The first position in the cache is beyond the requested position. 1447 // back up more until we get a boundary <= the requested position. 1448 while (fBoundaries[fStartBufIdx] > position) { 1449 populatePreceding(); 1450 } 1451 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1452 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1453 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1454 next(); 1455 } 1456 if (fTextIdx > position) { 1457 // If position is not itself a boundary, the next() loop above will overshoot. 1458 // Back up one, leaving cache position at the boundary preceding the requested position. 1459 previous(); 1460 } 1461 return true; 1462 } 1463 1464 assert fTextIdx == position; 1465 return true; 1466 1467 }; 1468 1469 /** 1470 * Add boundary(s) to the cache following the current last boundary. 1471 * Return false if at the end of the text, and no more boundaries can be added. 1472 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1473 */ populateFollowing()1474 boolean populateFollowing() { 1475 int fromPosition = fBoundaries[fEndBufIdx]; 1476 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1477 int pos = 0; 1478 int ruleStatusIdx = 0; 1479 1480 if (fDictionaryCache.following(fromPosition)) { 1481 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1482 return true; 1483 } 1484 1485 fPosition = fromPosition; 1486 pos = handleNext(); 1487 if (pos == BreakIterator.DONE) { 1488 return false; 1489 } 1490 1491 ruleStatusIdx = fRuleStatusIndex; 1492 if (fDictionaryCharCount > 0) { 1493 // The text segment obtained from the rules includes dictionary characters. 1494 // Subdivide it, with subdivided results going into the dictionary cache. 1495 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1496 if (fDictionaryCache.following(fromPosition)) { 1497 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1498 return true; 1499 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1500 // But be careful with interactions with populateNear(). 1501 } 1502 } 1503 1504 // Rule based segment did not include dictionary characters. 1505 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1506 // meaning that we didn't take the return, above. 1507 // Add its end point to the cache. 1508 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1509 1510 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1511 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1512 // 1513 for (int count=0; count<6; ++count) { 1514 pos = handleNext(); 1515 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1516 break; 1517 } 1518 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1519 } 1520 return true; 1521 }; 1522 1523 /** 1524 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1525 * Leave the iteration position on the first added boundary. 1526 * Return false if no boundaries could be added (if at the start of the text.) 1527 */ populatePreceding()1528 boolean populatePreceding() { 1529 int textBegin = fText.getBeginIndex(); 1530 int fromPosition = fBoundaries[fStartBufIdx]; 1531 if (fromPosition == textBegin) { 1532 return false; 1533 } 1534 1535 int position = textBegin; 1536 int positionStatusIdx = 0; 1537 1538 if (fDictionaryCache.preceding(fromPosition)) { 1539 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1540 return true; 1541 } 1542 1543 int backupPosition = fromPosition; 1544 1545 // Find a boundary somewhere preceding the first already-cached boundary 1546 do { 1547 backupPosition = backupPosition - 30; 1548 if (backupPosition <= textBegin) { 1549 backupPosition = textBegin; 1550 } else { 1551 backupPosition = handleSafePrevious(backupPosition); 1552 } 1553 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1554 position = textBegin; 1555 positionStatusIdx = 0; 1556 } else { 1557 // Advance to the boundary following the backup position. 1558 // There is a complication: the safe reverse rules identify pairs of code points 1559 // that are safe. If advancing from the safe point moves forwards by less than 1560 // two code points, we need to advance one more time to ensure that the boundary 1561 // is good, including a correct rules status value. 1562 // 1563 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1564 position = handleNext(); 1565 if (position == backupPosition + 1 || 1566 (position == backupPosition + 2 && 1567 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1568 Character.isLowSurrogate(fText.next()))) { 1569 // The initial handleNext() only advanced by a single code point. Go again. 1570 // Safe rules identify safe pairs. 1571 position = handleNext(); 1572 } 1573 positionStatusIdx = fRuleStatusIndex; 1574 } 1575 } while (position >= fromPosition); 1576 1577 // Find boundaries between the one we just located and the first already-cached boundary 1578 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1579 1580 fSideBuffer.removeAllElements(); 1581 fSideBuffer.push(position); 1582 fSideBuffer.push(positionStatusIdx); 1583 1584 do { 1585 int prevPosition = fPosition = position; 1586 int prevStatusIdx = positionStatusIdx; 1587 position = handleNext(); 1588 positionStatusIdx = fRuleStatusIndex; 1589 if (position == BreakIterator.DONE) { 1590 break; 1591 } 1592 1593 boolean segmentHandledByDictionary = false; 1594 if (fDictionaryCharCount != 0) { 1595 // Segment from the rules includes dictionary characters. 1596 // Subdivide it, with subdivided results going into the dictionary cache. 1597 int dictSegEndPosition = position; 1598 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1599 while (fDictionaryCache.following(prevPosition)) { 1600 position = fDictionaryCache.fBoundary; 1601 positionStatusIdx = fDictionaryCache.fStatusIndex; 1602 segmentHandledByDictionary = true; 1603 assert(position > prevPosition); 1604 if (position >= fromPosition) { 1605 break; 1606 } 1607 assert(position <= dictSegEndPosition); 1608 fSideBuffer.push(position); 1609 fSideBuffer.push(positionStatusIdx); 1610 prevPosition = position; 1611 } 1612 assert(position==dictSegEndPosition || position>=fromPosition); 1613 } 1614 1615 if (!segmentHandledByDictionary && position < fromPosition) { 1616 fSideBuffer.push(position); 1617 fSideBuffer.push(positionStatusIdx); 1618 } 1619 } while (position < fromPosition); 1620 1621 // Move boundaries from the side buffer to the main circular buffer. 1622 boolean success = false; 1623 if (!fSideBuffer.isEmpty()) { 1624 positionStatusIdx = fSideBuffer.pop(); 1625 position = fSideBuffer.pop(); 1626 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1627 success = true; 1628 } 1629 1630 while (!fSideBuffer.isEmpty()) { 1631 positionStatusIdx = fSideBuffer.pop(); 1632 position = fSideBuffer.pop(); 1633 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1634 // No space in circular buffer to hold a new preceding result while 1635 // also retaining the current cache (iteration) position. 1636 // Bailing out is safe; the cache will refill again if needed. 1637 break; 1638 } 1639 } 1640 return success; 1641 }; 1642 1643 1644 static final boolean RetainCachePosition = false; 1645 static final boolean UpdateCachePosition = true; 1646 1647 /** 1648 * Add the boundary following the current position. 1649 * The current position can be left as it was, or changed to the newly added boundary, 1650 * as specified by the update parameter. 1651 */ addFollowing(int position, int ruleStatusIdx, boolean update)1652 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1653 assert(position > fBoundaries[fEndBufIdx]); 1654 assert(ruleStatusIdx <= Short.MAX_VALUE); 1655 int nextIdx = modChunkSize(fEndBufIdx + 1); 1656 if (nextIdx == fStartBufIdx) { 1657 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1658 } 1659 fBoundaries[nextIdx] = position; 1660 fStatuses[nextIdx] = (short)ruleStatusIdx; 1661 fEndBufIdx = nextIdx; 1662 if (update == UpdateCachePosition) { 1663 // Set current position to the newly added boundary. 1664 fBufIdx = nextIdx; 1665 fTextIdx = position; 1666 } else { 1667 // Retaining the original cache position. 1668 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1669 // It's the responsibility of callers of this function to not add too many. 1670 assert(nextIdx != fBufIdx); 1671 } 1672 1673 }; 1674 1675 1676 /** 1677 * Add the boundary preceding the current position. 1678 * The current position can be left as it was, or changed to the newly added boundary, 1679 * as specified by the update parameter. 1680 */ addPreceding(int position, int ruleStatusIdx, boolean update)1681 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1682 assert(position < fBoundaries[fStartBufIdx]); 1683 assert(ruleStatusIdx <= Short.MAX_VALUE); 1684 int nextIdx = modChunkSize(fStartBufIdx - 1); 1685 if (nextIdx == fEndBufIdx) { 1686 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1687 // Failure. The insertion of the new boundary would claim the buffer position that is the 1688 // current iteration position. And we also want to retain the current iteration position. 1689 // (The buffer is already completely full of entries that precede the iteration position.) 1690 return false; 1691 } 1692 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1693 } 1694 fBoundaries[nextIdx] = position; 1695 fStatuses[nextIdx] = (short)ruleStatusIdx; 1696 fStartBufIdx = nextIdx; 1697 if (update == UpdateCachePosition) { 1698 fBufIdx = nextIdx; 1699 fTextIdx = position; 1700 } 1701 return true; 1702 }; 1703 1704 /** 1705 * Set the cache position to the specified position, or, if the position 1706 * falls between to cached boundaries, to the preceding boundary. 1707 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1708 * The startPosition must be on a code point boundary. 1709 * 1710 * Return true if successful, false if the specified position is after 1711 * the last cached boundary or before the first. 1712 */ 1713 boolean seek(int pos) { 1714 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1715 return false; 1716 } 1717 if (pos == fBoundaries[fStartBufIdx]) { 1718 // Common case: seek(0), from BreakIterator::first() 1719 fBufIdx = fStartBufIdx; 1720 fTextIdx = fBoundaries[fBufIdx]; 1721 return true; 1722 } 1723 if (pos == fBoundaries[fEndBufIdx]) { 1724 fBufIdx = fEndBufIdx; 1725 fTextIdx = fBoundaries[fBufIdx]; 1726 return true; 1727 } 1728 1729 int min = fStartBufIdx; 1730 int max = fEndBufIdx; 1731 while (min != max) { 1732 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1733 probe = modChunkSize(probe); 1734 if (fBoundaries[probe] > pos) { 1735 max = probe; 1736 } else { 1737 min = modChunkSize(probe + 1); 1738 } 1739 } 1740 assert(fBoundaries[max] > pos); 1741 fBufIdx = modChunkSize(max - 1); 1742 fTextIdx = fBoundaries[fBufIdx]; 1743 assert(fTextIdx <= pos); 1744 return true; 1745 1746 }; 1747 1748 1749 /** 1750 * copy constructor, used from RuleBasedBreakIterator.clone(). 1751 * 1752 * @param src 1753 */ BreakCache(BreakCache src)1754 BreakCache(BreakCache src) { 1755 fStartBufIdx = src.fStartBufIdx; 1756 fEndBufIdx = src.fEndBufIdx; 1757 fTextIdx = src.fTextIdx; 1758 fBufIdx = src.fBufIdx; 1759 fBoundaries = src.fBoundaries.clone(); 1760 fStatuses = src.fStatuses.clone(); 1761 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1762 } 1763 dumpCache()1764 void dumpCache() { 1765 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1766 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1767 System.out.printf("%d %d%n", i, fBoundaries[i]); 1768 if (i == fEndBufIdx) { 1769 break; 1770 } 1771 } 1772 }; 1773 modChunkSize(int index)1774 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1775 1776 static final int CACHE_SIZE = 128; 1777 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1778 1779 int fStartBufIdx; 1780 int fEndBufIdx; // inclusive 1781 1782 int fTextIdx; 1783 int fBufIdx; 1784 1785 int[] fBoundaries = new int[CACHE_SIZE]; 1786 short[] fStatuses = new short[CACHE_SIZE]; 1787 1788 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1789 }; 1790 1791 1792 1793 1794 } 1795 1796