1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2005-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.text; 11 12 import static com.ibm.icu.impl.CharacterIteration.DONE32; 13 import static com.ibm.icu.impl.CharacterIteration.next32; 14 import static com.ibm.icu.impl.CharacterIteration.nextTrail32; 15 16 import java.io.ByteArrayOutputStream; 17 import java.io.IOException; 18 import java.io.InputStream; 19 import java.io.OutputStream; 20 import java.nio.ByteBuffer; 21 import java.text.CharacterIterator; 22 import java.util.ArrayList; 23 import java.util.List; 24 25 import com.ibm.icu.impl.CharacterIteration; 26 import com.ibm.icu.impl.ICUBinary; 27 import com.ibm.icu.impl.ICUDebug; 28 import com.ibm.icu.impl.RBBIDataWrapper; 29 import com.ibm.icu.impl.Trie2; 30 import com.ibm.icu.lang.UCharacter; 31 import com.ibm.icu.lang.UProperty; 32 import com.ibm.icu.lang.UScript; 33 34 /** 35 * Rule Based Break Iterator 36 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 37 * 38 * @stable ICU 2.0 39 */ 40 public class RuleBasedBreakIterator extends BreakIterator { 41 //======================================================================= 42 // Constructors & Factories 43 //======================================================================= 44 45 /** 46 * private constructor 47 */ RuleBasedBreakIterator()48 private RuleBasedBreakIterator() { 49 fDictionaryCharCount = 0; 50 synchronized(gAllBreakEngines) { 51 fBreakEngines = new ArrayList<>(gAllBreakEngines); 52 } 53 } 54 55 /** 56 * Create a break iterator from a precompiled set of break rules. 57 * 58 * Creating a break iterator from the binary rules is much faster than 59 * creating one from source rules. 60 * 61 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 62 * Binary break iterator rules are not guaranteed to be compatible between 63 * different versions of ICU. 64 * 65 * @param is an input stream supplying the compiled binary rules. 66 * @throws IOException if there is an error while reading the rules from the InputStream. 67 * @see #compileRules(String, OutputStream) 68 * @stable ICU 4.8 69 */ getInstanceFromCompiledRules(InputStream is)70 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 71 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 72 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 73 return This; 74 } 75 76 /** 77 * Create a break iterator from a precompiled set of break rules. 78 * 79 * Creating a break iterator from the binary rules is much faster than 80 * creating one from source rules. 81 * 82 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 83 * Binary break iterator rules are not guaranteed to be compatible between 84 * different versions of ICU. 85 * 86 * @param bytes a buffer supplying the compiled binary rules. 87 * @throws IOException if there is an error while reading the rules from the buffer. 88 * @see #compileRules(String, OutputStream) 89 * @internal 90 * @deprecated This API is ICU internal only. 91 */ 92 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)93 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 94 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 95 This.fRData = RBBIDataWrapper.get(bytes); 96 return This; 97 } 98 99 /** 100 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 101 * @param rules The break rules to be used. 102 * @stable ICU 2.2 103 */ RuleBasedBreakIterator(String rules)104 public RuleBasedBreakIterator(String rules) { 105 this(); 106 try { 107 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 108 compileRules(rules, ruleOS); 109 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 110 } catch (IOException e) { 111 ///CLOVER:OFF 112 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 113 // causing bogus compiled rules to be produced, but with no compile error raised. 114 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 115 + e.getMessage()); 116 throw rte; 117 ///CLOVER:ON 118 } 119 } 120 121 //======================================================================= 122 // Boilerplate 123 //======================================================================= 124 125 /** 126 * Clones this iterator. 127 * @return A newly-constructed RuleBasedBreakIterator with the same 128 * behavior as this one. 129 * @stable ICU 2.0 130 */ 131 @Override clone()132 public Object clone() { 133 RuleBasedBreakIterator result; 134 result = (RuleBasedBreakIterator)super.clone(); 135 if (fText != null) { 136 result.fText = (CharacterIterator)(fText.clone()); 137 } 138 synchronized (gAllBreakEngines) { 139 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 140 } 141 result.fLookAheadMatches = new LookAheadResults(); 142 result.fBreakCache = result.new BreakCache(fBreakCache); 143 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 144 return result; 145 } 146 147 148 /** 149 * Returns true if both BreakIterators are of the same class, have the same 150 * rules, and iterate over the same text. 151 * @stable ICU 2.0 152 */ 153 @Override equals(Object that)154 public boolean equals(Object that) { 155 if (that == null) { 156 return false; 157 } 158 if (this == that) { 159 return true; 160 } 161 try { 162 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 163 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 164 return false; 165 } 166 if (fRData != null && other.fRData != null && 167 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 168 return false; 169 } 170 if (fText == null && other.fText == null) { 171 return true; 172 } 173 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 174 return false; 175 } 176 return fPosition == other.fPosition; 177 } 178 catch(ClassCastException e) { 179 return false; 180 } 181 } 182 183 /** 184 * Returns the description (rules) used to create this iterator. 185 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 186 * @stable ICU 2.0 187 */ 188 @Override toString()189 public String toString() { 190 String retStr = ""; 191 if (fRData != null) { 192 retStr = fRData.fRuleSource; 193 } 194 return retStr; 195 } 196 197 /** 198 * Compute a hashcode for this BreakIterator 199 * @return A hash code 200 * @stable ICU 2.0 201 */ 202 @Override hashCode()203 public int hashCode() 204 { 205 return fRData.fRuleSource.hashCode(); 206 } 207 208 209 private static final int START_STATE = 1; // The state number of the starting state 210 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 211 212 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 213 // of user text. A variable with this enum type keeps track of where we 214 // are. The state machine only fetches user text input while in RUN mode. 215 private static final int RBBI_START = 0; 216 private static final int RBBI_RUN = 1; 217 private static final int RBBI_END = 2; 218 219 /** 220 * The character iterator through which this BreakIterator accesses the text. 221 */ 222 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 223 224 /** 225 * The rule data for this BreakIterator instance. 226 * Not intended for public use. Declared public for testing purposes only. 227 * @internal 228 * @deprecated This API is ICU internal only. 229 */ 230 @Deprecated 231 public RBBIDataWrapper fRData; 232 233 /** 234 * The iteration state - current position, rule status for the current position, 235 * and whether the iterator ran off the end, yielding UBRK_DONE. 236 * Current position is pinned to be 0 < position <= text.length. 237 * Current position is always set to a boundary. 238 * 239 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 240 * Never has the value UBRK_DONE (-1). 241 */ 242 private int fPosition; 243 244 /** 245 * Index of the Rule {tag} values for the most recent match. 246 */ 247 private int fRuleStatusIndex; 248 249 /** 250 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 251 */ 252 private boolean fDone; 253 254 /** 255 * Cache of previously determined boundary positions. 256 */ 257 private BreakCache fBreakCache = new BreakCache(); 258 259 260 /** 261 * Counter for the number of characters encountered with the "dictionary" 262 * flag set. Normal RBBI iterators don't use it, although the code 263 * for updating it is live. Dictionary Based break iterators (a subclass 264 * of us) access this field directly. 265 * @internal 266 */ 267 private int fDictionaryCharCount; 268 269 private DictionaryCache fDictionaryCache = new DictionaryCache(); 270 271 /** 272 * ICU debug argument name for RBBI 273 */ 274 private static final String RBBI_DEBUG_ARG = "rbbi"; 275 276 /** 277 * Debugging flag. Trace operation of state machine when true. 278 */ 279 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 280 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 281 282 /** 283 * The "default" break engine - just skips over ranges of dictionary words, 284 * producing no breaks. Should only be used if characters need to be handled 285 * by a dictionary but we have no dictionary implementation for them. 286 * 287 * Only one instance; shared by all break iterators. 288 */ 289 private static final UnhandledBreakEngine gUnhandledBreakEngine; 290 291 /** 292 * List of all known break engines, common for all break iterators. 293 * Lazily updated as break engines are needed, because instantiation of 294 * break engines is expensive. 295 * 296 * Because gAllBreakEngines can be referenced concurrently from different 297 * BreakIterator instances, all access is synchronized. 298 */ 299 private static final List<LanguageBreakEngine> gAllBreakEngines; 300 301 static { 302 gUnhandledBreakEngine = new UnhandledBreakEngine(); 303 gAllBreakEngines = new ArrayList<>(); 304 gAllBreakEngines.add(gUnhandledBreakEngine); 305 } 306 307 /** 308 * List of all known break engines. Similar to gAllBreakEngines, but local to a 309 * break iterator, allowing it to be used without synchronization. 310 */ 311 private List<LanguageBreakEngine> fBreakEngines; 312 313 /** 314 * Dump the contents of the state table and character classes for this break iterator. 315 * For debugging only. 316 * @internal 317 * @deprecated This API is ICU internal only. 318 */ 319 @Deprecated dump(java.io.PrintStream out)320 public void dump(java.io.PrintStream out) { 321 if (out == null) { 322 out = System.out; 323 } 324 this.fRData.dump(out); 325 } 326 327 /** 328 * Compile a set of source break rules into the binary state tables used 329 * by the break iterator engine. Creating a break iterator from precompiled 330 * rules is much faster than creating one from source rules. 331 * 332 * Binary break rules are not guaranteed to be compatible between different 333 * versions of ICU. 334 * 335 * 336 * @param rules The source form of the break rules 337 * @param ruleBinary An output stream to receive the compiled rules. 338 * @throws IOException If there is an error writing the output. 339 * @see #getInstanceFromCompiledRules(InputStream) 340 * @stable ICU 4.8 341 */ compileRules(String rules, OutputStream ruleBinary)342 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 343 RBBIRuleBuilder.compileRules(rules, ruleBinary); 344 } 345 346 //======================================================================= 347 // BreakIterator overrides 348 //======================================================================= 349 350 /** 351 * Sets the current iteration position to the beginning of the text. 352 * (i.e., the CharacterIterator's starting offset). 353 * @return The offset of the beginning of the text. 354 * @stable ICU 2.0 355 */ 356 @Override first()357 public int first() { 358 if (fText == null) { 359 return BreakIterator.DONE; 360 } 361 fText.first(); 362 int start = fText.getIndex(); 363 if (!fBreakCache.seek(start)) { 364 fBreakCache.populateNear(start); 365 } 366 fBreakCache.current(); 367 assert(fPosition == start); 368 return fPosition; 369 } 370 371 /** 372 * Sets the current iteration position to the end of the text. 373 * (i.e., the CharacterIterator's ending offset). 374 * @return The text's past-the-end offset. 375 * @stable ICU 2.0 376 */ 377 @Override last()378 public int last() { 379 if (fText == null) { 380 return BreakIterator.DONE; 381 } 382 int endPos = fText.getEndIndex(); 383 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 384 assert(endShouldBeBoundary); 385 if (fPosition != endPos) { 386 assert(fPosition == endPos); 387 } 388 return endPos; 389 } 390 391 /** 392 * Advances the iterator either forward or backward the specified number of steps. 393 * Negative values move backward, and positive values move forward. This is 394 * equivalent to repeatedly calling next() or previous(). 395 * @param n The number of steps to move. The sign indicates the direction 396 * (negative is backwards, and positive is forwards). 397 * @return The character offset of the boundary position n boundaries away from 398 * the current one. 399 * @stable ICU 2.0 400 */ 401 @Override next(int n)402 public int next(int n) { 403 int result = 0; 404 if (n > 0) { 405 for (; n > 0 && result != DONE; --n) { 406 result = next(); 407 } 408 } else if (n < 0) { 409 for (; n < 0 && result != DONE; ++n) { 410 result = previous(); 411 } 412 } else { 413 result = current(); 414 } 415 return result; 416 } 417 418 /** 419 * Advances the iterator to the next boundary position. 420 * @return The position of the first boundary after this one. 421 * @stable ICU 2.0 422 */ 423 @Override next()424 public int next() { 425 fBreakCache.next(); 426 return fDone ? DONE : fPosition; 427 } 428 429 /** 430 * Moves the iterator backwards, to the boundary preceding the current one. 431 * @return The position of the boundary position immediately preceding the starting position. 432 * @stable ICU 2.0 433 */ 434 @Override previous()435 public int previous() { 436 fBreakCache.previous(); 437 return fDone ? DONE : fPosition; 438 } 439 440 /** 441 * Sets the iterator to refer to the first boundary position following 442 * the specified position. 443 * @param startPos The position from which to begin searching for a break position. 444 * @return The position of the first break after the current position. 445 * @stable ICU 2.0 446 */ 447 @Override following(int startPos)448 public int following(int startPos) { 449 // if the supplied position is before the beginning, return the 450 // text's starting offset 451 if (startPos < fText.getBeginIndex()) { 452 return first(); 453 } 454 455 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 456 // Or it may be beyond the end of the text. 457 startPos = CISetIndex32(fText, startPos); 458 fBreakCache.following(startPos); 459 return fDone ? DONE : fPosition; 460 } 461 462 463 /** 464 * Sets the iterator to refer to the last boundary position before the 465 * specified position. 466 * @param offset The position to begin searching for a break from. 467 * @return The position of the last boundary before the starting position. 468 * @stable ICU 2.0 469 */ 470 @Override preceding(int offset)471 public int preceding(int offset) { 472 if (fText == null || offset > fText.getEndIndex()) { 473 return last(); 474 } else if (offset < fText.getBeginIndex()) { 475 return first(); 476 } 477 478 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 479 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 480 int adjustedOffset = offset; 481 fBreakCache.preceding(adjustedOffset); 482 return fDone ? DONE : fPosition; 483 484 } 485 486 487 /** 488 * Throw IllegalArgumentException unless begin <= offset < end. 489 * @stable ICU 2.0 490 */ checkOffset(int offset, CharacterIterator text)491 protected static final void checkOffset(int offset, CharacterIterator text) { 492 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 493 throw new IllegalArgumentException("offset out of bounds"); 494 } 495 } 496 497 498 /** 499 * Returns true if the specified position is a boundary position. As a side 500 * effect, leaves the iterator pointing to the first boundary position at 501 * or after "offset". 502 * @param offset the offset to check. 503 * @return True if "offset" is a boundary position. 504 * @stable ICU 2.0 505 */ 506 @Override isBoundary(int offset)507 public boolean isBoundary(int offset) { 508 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 509 // to not be boundaries, and to not be errors. 510 checkOffset(offset, fText); 511 512 // Adjust offset to be on a code point boundary and not beyond the end of the text. 513 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 514 // But we still need the side effect of leaving iteration at the following boundary. 515 int adjustedOffset = CISetIndex32(fText, offset); 516 517 boolean result = false; 518 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 519 result = (fBreakCache.current() == offset); 520 } 521 522 if (!result) { 523 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 524 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 525 next(); 526 } 527 return result; 528 529 } 530 531 /** 532 * Returns the current iteration position. Note that DONE is never 533 * returned from this function; if iteration has run to the end of a 534 * string, current() will return the length of the string while 535 * next() will return BreakIterator.DONE). 536 * @return The current iteration position. 537 * @stable ICU 2.0 538 */ 539 @Override current()540 public int current() { 541 return (fText != null) ? fPosition : BreakIterator.DONE; 542 } 543 544 545 /** 546 * Return the status tag from the break rule that determined the boundary at 547 * the current iteration position. The values appear in the rule source 548 * within brackets, {123}, for example. For rules that do not specify a 549 * status, a default value of 0 is returned. If more than one rule applies, 550 * the numerically largest of the possible status values is returned. 551 * <p> 552 * Of the standard types of ICU break iterators, only the word and line break 553 * iterator provides status values. The values are defined in 554 * class RuleBasedBreakIterator, and allow distinguishing between words 555 * that contain alphabetic letters, "words" that appear to be numbers, 556 * punctuation and spaces, words containing ideographic characters, and 557 * more. Call <code>getRuleStatus</code> after obtaining a boundary 558 * position from <code>next()</code>, <code>previous()</code>, or 559 * any other break iterator functions that returns a boundary position. 560 * <p> 561 * Note that <code>getRuleStatus()</code> returns the value corresponding to 562 * <code>current()</code> index even after <code>next()</code> has returned DONE. 563 * <p> 564 565 * @return the status from the break rule that determined the boundary 566 * at the current iteration position. 567 * 568 * @stable ICU 60 569 */ 570 571 @Override getRuleStatus()572 public int getRuleStatus() { 573 // Status records have this form: 574 // Count N <-- fLastRuleStatusIndex points here. 575 // Status val 0 576 // Status val 1 577 // ... 578 // Status val N-1 <-- the value we need to return 579 // The status values are sorted in ascending order. 580 // This function returns the last (largest) of the array of status values. 581 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 582 int tagVal = fRData.fStatusTable[idx]; 583 return tagVal; 584 } 585 586 /** 587 * Get the status (tag) values from the break rule(s) that determined the boundary 588 * at the current iteration position. The values appear in the rule source 589 * within brackets, {123}, for example. The default status value for rules 590 * that do not explicitly provide one is zero. 591 * <p> 592 * The status values used by the standard ICU break rules are defined 593 * as public constants in class RuleBasedBreakIterator. 594 * <p> 595 * If the size of the output array is insufficient to hold the data, 596 * the output will be truncated to the available length. No exception 597 * will be thrown. 598 * 599 * @param fillInArray an array to be filled in with the status values. 600 * @return The number of rule status values from the rules that determined 601 * the boundary at the current iteration position. 602 * In the event that the array is too small, the return value 603 * is the total number of status values that were available, 604 * not the reduced number that were actually returned. 605 * @stable ICU 60 606 */ 607 @Override getRuleStatusVec(int[] fillInArray)608 public int getRuleStatusVec(int[] fillInArray) { 609 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 610 if (fillInArray != null) { 611 int numToCopy = Math.min(numStatusVals, fillInArray.length); 612 for (int i=0; i<numToCopy; i++) { 613 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 614 } 615 } 616 return numStatusVals; 617 } 618 619 /** 620 * Returns a CharacterIterator over the text being analyzed. 621 * <p> 622 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 623 * must not be modified in any way while the BreakIterator is still in use. 624 * Doing so will lead to undefined behavior of the BreakIterator. 625 * Clone the returned CharacterIterator first and work with that. 626 * <p> 627 * The returned CharacterIterator is a reference 628 * to the <b>actual iterator being used</b> by the BreakIterator. 629 * No guarantees are made about the current position 630 * of this iterator when it is returned; it may differ from the 631 * BreakIterators current position. If you need to move that 632 * position to examine the text, clone this function's return value first. 633 * @return An iterator over the text being analyzed. 634 * @stable ICU 2.0 635 */ 636 @Override getText()637 public CharacterIterator getText() { 638 return fText; 639 } 640 641 /** 642 * Set the iterator to analyze a new piece of text. This function resets 643 * the current iteration position to the beginning of the text. 644 * (The old iterator is dropped.) 645 * <p> 646 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 647 * directly by the BreakIterator, and must not be altered in any 648 * way by code outside of the BreakIterator. 649 * Doing so will lead to undefined behavior of the BreakIterator. 650 * 651 * @param newText An iterator over the text to analyze. 652 * @stable ICU 2.0 653 */ 654 @Override setText(CharacterIterator newText)655 public void setText(CharacterIterator newText) { 656 if (newText != null) { 657 fBreakCache.reset(newText.getBeginIndex(), 0); 658 } else { 659 fBreakCache.reset(); 660 } 661 fDictionaryCache.reset(); 662 fText = newText; 663 this.first(); 664 } 665 666 /** 667 * Control debug, trace and dump options. 668 * @internal 669 * @deprecated This API is ICU internal only. 670 */ 671 @Deprecated 672 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 673 ICUDebug.value(RBBI_DEBUG_ARG) : null; 674 675 getLanguageBreakEngine(int c)676 private LanguageBreakEngine getLanguageBreakEngine(int c) { 677 678 // We have a dictionary character. 679 // Does an already instantiated break engine handle it? 680 for (LanguageBreakEngine candidate : fBreakEngines) { 681 if (candidate.handles(c)) { 682 return candidate; 683 } 684 } 685 686 synchronized (gAllBreakEngines) { 687 // This break iterator's list of break engines didn't handle the character. 688 // Check the global list, another break iterator may have instantiated the 689 // desired engine. 690 for (LanguageBreakEngine candidate : gAllBreakEngines) { 691 if (candidate.handles(c)) { 692 fBreakEngines.add(candidate); 693 return candidate; 694 } 695 } 696 697 // The global list doesn't have an existing engine, build one. 698 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 699 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 700 // Katakana, Hiragana and Han are handled by the same dictionary engine. 701 // Fold them together for mapping from script -> engine. 702 script = UScript.HAN; 703 } 704 705 LanguageBreakEngine eng; 706 try { 707 switch (script) { 708 case UScript.THAI: 709 eng = new ThaiBreakEngine(); 710 break; 711 case UScript.LAO: 712 eng = new LaoBreakEngine(); 713 break; 714 case UScript.MYANMAR: 715 eng = new BurmeseBreakEngine(); 716 break; 717 case UScript.KHMER: 718 eng = new KhmerBreakEngine(); 719 break; 720 case UScript.HAN: 721 eng = new CjkBreakEngine(false); 722 break; 723 case UScript.HANGUL: 724 eng = new CjkBreakEngine(true); 725 break; 726 default: 727 gUnhandledBreakEngine.handleChar(c); 728 eng = gUnhandledBreakEngine; 729 break; 730 } 731 } catch (IOException e) { 732 eng = null; 733 } 734 735 if (eng != null && eng != gUnhandledBreakEngine) { 736 gAllBreakEngines.add(eng); 737 fBreakEngines.add(eng); 738 } 739 return eng; 740 } // end synchronized(gAllBreakEngines) 741 } 742 743 private static final int kMaxLookaheads = 8; 744 private static class LookAheadResults { 745 int fUsedSlotLimit; 746 int[] fPositions; 747 int[] fKeys; 748 LookAheadResults()749 LookAheadResults() { 750 fUsedSlotLimit= 0; 751 fPositions = new int[kMaxLookaheads]; 752 fKeys = new int[kMaxLookaheads]; 753 } 754 getPosition(int key)755 int getPosition(int key) { 756 for (int i=0; i<fUsedSlotLimit; ++i) { 757 if (fKeys[i] == key) { 758 return fPositions[i]; 759 } 760 } 761 assert(false); 762 return -1; 763 } 764 setPosition(int key, int position)765 void setPosition(int key, int position) { 766 int i; 767 for (i=0; i<fUsedSlotLimit; ++i) { 768 if (fKeys[i] == key) { 769 fPositions[i] = position; 770 return; 771 } 772 } 773 if (i >= kMaxLookaheads) { 774 assert(false); 775 i = kMaxLookaheads - 1; 776 } 777 fKeys[i] = key; 778 fPositions[i] = position; 779 assert(fUsedSlotLimit == i); 780 fUsedSlotLimit = i + 1; 781 } 782 reset()783 void reset() { 784 fUsedSlotLimit = 0; 785 } 786 }; 787 private LookAheadResults fLookAheadMatches = new LookAheadResults(); 788 789 790 /** 791 * The State Machine Engine for moving forward is here. 792 * This function is the heart of the RBBI run time engine. 793 * 794 * Input 795 * fPosition, the position in the text to begin from. 796 * Output 797 * fPosition: the boundary following the starting position. 798 * fDictionaryCharCount the number of dictionary characters encountered. 799 * If > 0, the segment will be further subdivided 800 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 801 * 802 * @return the new iterator position 803 * 804 * A note on supplementary characters and the position of underlying 805 * Java CharacterIterator: Normally, a character iterator is positioned at 806 * the char most recently returned by next(). Within this function, when 807 * a supplementary char is being processed, the char iterator is left 808 * sitting on the trail surrogate, in the middle of the code point. 809 * This is different from everywhere else, where an iterator always 810 * points at the lead surrogate of a supplementary. 811 */ handleNext()812 private int handleNext() { 813 if (TRACE) { 814 System.out.println("Handle Next pos char state category"); 815 } 816 817 // handleNext always sets the break tag value. 818 // Set the default for it. 819 fRuleStatusIndex = 0; 820 fDictionaryCharCount = 0; 821 822 // caches for quicker access 823 CharacterIterator text = fText; 824 Trie2 trie = fRData.fTrie; 825 826 short[] stateTable = fRData.fFTable.fTable; 827 int initialPosition = fPosition; 828 text.setIndex(initialPosition); 829 int result = initialPosition; 830 831 // Set up the starting char 832 int c = text.current(); 833 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 834 c = nextTrail32(text, c); 835 if (c == DONE32) { 836 fDone = true; 837 return BreakIterator.DONE; 838 } 839 } 840 841 // Set the initial state for the state machine 842 int state = START_STATE; 843 int row = fRData.getRowIndex(state); 844 short category = 3; 845 int flagsState = fRData.fFTable.fFlags; 846 int mode = RBBI_RUN; 847 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 848 category = 2; 849 mode = RBBI_START; 850 if (TRACE) { 851 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 852 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 853 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 854 } 855 } 856 fLookAheadMatches.reset(); 857 858 // loop until we reach the end of the text or transition to state 0 859 while (state != STOP_STATE) { 860 if (c == DONE32) { 861 // Reached end of input string. 862 if (mode == RBBI_END) { 863 // We have already run the loop one last time with the 864 // character set to the pseudo {eof} value. Now it is time 865 // to unconditionally bail out. 866 break; 867 } 868 // Run the loop one last time with the fake end-of-input character category 869 mode = RBBI_END; 870 category = 1; 871 } 872 else if (mode == RBBI_RUN) { 873 // Get the char category. An incoming category of 1 or 2 mens that 874 // we are preset for doing the beginning or end of input, and 875 // that we shouldn't get a category from an actual text input character. 876 // 877 878 // look up the current character's character category, which tells us 879 // which column in the state table to look at. 880 // 881 category = (short) trie.get(c); 882 883 // Check the dictionary bit in the character's category. 884 // Counter is only used by dictionary based iterators (subclasses). 885 // Chars that need to be handled by a dictionary have a flag bit set 886 // in their category values. 887 // 888 if ((category & 0x4000) != 0) { 889 fDictionaryCharCount++; 890 // And off the dictionary flag bit. 891 category &= ~0x4000; 892 } 893 894 if (TRACE) { 895 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 896 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 897 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 898 } 899 900 // Advance to the next character. 901 // If this is a beginning-of-input loop iteration, don't advance. 902 // The next iteration will be processing the first real input character. 903 c = text.next(); 904 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 905 c = nextTrail32(text, c); 906 } 907 } 908 else { 909 mode = RBBI_RUN; 910 } 911 912 // look up a state transition in the state table 913 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 914 row = fRData.getRowIndex(state); 915 916 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 917 // Match found, common case 918 result = text.getIndex(); 919 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 920 // The iterator has been left in the middle of a surrogate pair. 921 // We want the start of it. 922 result--; 923 } 924 925 // Remember the break status (tag) values. 926 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 927 } 928 929 int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 930 if (completedRule > 0) { 931 // Lookahead match is completed 932 int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 933 if (lookaheadResult >= 0) { 934 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 935 fPosition = lookaheadResult; 936 return lookaheadResult; 937 } 938 } 939 940 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 941 if (rule != 0) { 942 // At the position of a '/' in a look-ahead match. Record it. 943 int pos = text.getIndex(); 944 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 945 // The iterator has been left in the middle of a surrogate pair. 946 // We want the beginning of it. 947 pos--; 948 } 949 fLookAheadMatches.setPosition(rule, pos); 950 } 951 952 953 } // End of state machine main loop 954 955 // The state machine is done. Check whether it found a match... 956 957 // If the iterator failed to advance in the match engine force it ahead by one. 958 // This indicates a defect in the break rules, which should always match 959 // at least one character. 960 961 if (result == initialPosition) { 962 if (TRACE) { 963 System.out.println("Iterator did not move. Advancing by 1."); 964 } 965 text.setIndex(initialPosition); 966 next32(text); 967 result = text.getIndex(); 968 fRuleStatusIndex = 0; 969 } 970 971 // Leave the iterator at our result position. 972 // (we may have advanced beyond the last accepting position chasing after 973 // longer matches that never completed.) 974 fPosition = result; 975 976 if (TRACE) { 977 System.out.println("result = " + result); 978 } 979 return result; 980 } 981 982 /** 983 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 984 * This locates a "Safe Position" from which the forward break rules 985 * will operate correctly. A Safe Position is not necessarily a boundary itself. 986 * 987 * The logic of this function is very similar to handleNext(), above, but simpler 988 * because the safe table does not require as many options. 989 * 990 * @param fromPosition the position in the input text to begin the iteration. 991 * @internal 992 */ handleSafePrevious(int fromPosition)993 private int handleSafePrevious(int fromPosition) { 994 int state; 995 short category = 0; 996 int result = 0; 997 998 // caches for quicker access 999 CharacterIterator text = fText; 1000 Trie2 trie = fRData.fTrie; 1001 short[] stateTable = fRData.fRTable.fTable; 1002 1003 CISetIndex32(text, fromPosition); 1004 if (TRACE) { 1005 System.out.print("Handle Previous pos char state category"); 1006 } 1007 1008 // if we're already at the start of the text, return DONE. 1009 if (text.getIndex() == text.getBeginIndex()) { 1010 return BreakIterator.DONE; 1011 } 1012 1013 // Set the initial state for the state machine 1014 int c = CharacterIteration.previous32(text); 1015 state = START_STATE; 1016 int row = fRData.getRowIndex(state); 1017 1018 // loop until we reach the start of the text or transition to state 0 1019 // 1020 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 1021 1022 // look up the current character's character category, which tells us 1023 // which column in the state table to look at. 1024 // 1025 // And off the dictionary flag bit. For reverse iteration it is not used. 1026 category = (short) trie.get(c); 1027 category &= ~0x4000; 1028 if (TRACE) { 1029 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1030 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1031 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1032 } 1033 1034 // State Transition - move machine to its next state 1035 // 1036 assert(category < fRData.fHeader.fCatCount); 1037 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1038 row = fRData.getRowIndex(state); 1039 1040 if (state == STOP_STATE) { 1041 // This is the normal exit from the lookup state machine. 1042 // Transition to state zero means we have found a safe point. 1043 break; 1044 } 1045 } 1046 1047 // The state machine is done. 1048 result = text.getIndex(); 1049 if (TRACE) { 1050 System.out.println("result = " + result); 1051 } 1052 return result; 1053 } 1054 1055 /** 1056 * Set the index of a CharacterIterator. 1057 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1058 * If the index points to a trail surrogate of a supplementary character, adjust it 1059 * to the start (lead surrogate) index. 1060 * 1061 * @param ci A CharacterIterator to set 1062 * @param index the index to set 1063 * @return the resulting index, possibly pinned or adjusted. 1064 */ 1065 private static int CISetIndex32(CharacterIterator ci, int index) { 1066 if (index <= ci.getBeginIndex()) { 1067 ci.first(); 1068 } else if (index >= ci.getEndIndex()) { 1069 ci.setIndex(ci.getEndIndex()); 1070 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1071 if (!Character.isHighSurrogate(ci.previous())) { 1072 ci.next(); 1073 } 1074 } 1075 return ci.getIndex(); 1076 } 1077 1078 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1079 * Dictionary boundaries are moved first to this cache, then from here 1080 * to the main BreakCache, where they may inter-leave with non-dictionary 1081 * boundaries. The public BreakIterator API always fetches directly 1082 * from the main BreakCache, not from here. 1083 * 1084 * In common situations, the number of boundaries in a single dictionary run 1085 * should be quite small, it will be terminated by punctuation, spaces, 1086 * or any other non-dictionary characters. The main BreakCache may end 1087 * up with boundaries from multiple dictionary based runs. 1088 * 1089 * The boundaries are stored in a simple ArrayList (vector), with the 1090 * assumption that they will be accessed sequentially. 1091 */ 1092 class DictionaryCache { 1093 1094 void reset() { 1095 fPositionInCache = -1; 1096 fStart = 0; 1097 fLimit = 0; 1098 fFirstRuleStatusIndex = 0; 1099 fOtherRuleStatusIndex = 0; 1100 fBreaks.removeAllElements(); 1101 }; 1102 1103 boolean following(int fromPos) { 1104 if (fromPos >= fLimit || fromPos < fStart) { 1105 fPositionInCache = -1; 1106 return false; 1107 } 1108 1109 // Sequential iteration, move from previous boundary to the following 1110 1111 int r = 0; 1112 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1113 ++fPositionInCache; 1114 if (fPositionInCache >= fBreaks.size()) { 1115 fPositionInCache = -1; 1116 return false; 1117 } 1118 r = fBreaks.elementAt(fPositionInCache); 1119 assert(r > fromPos); 1120 fBoundary = r; 1121 fStatusIndex = fOtherRuleStatusIndex; 1122 return true; 1123 } 1124 1125 // Random indexing. Linear search for the boundary following the given position. 1126 1127 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1128 r= fBreaks.elementAt(fPositionInCache); 1129 if (r > fromPos) { 1130 fBoundary = r; 1131 fStatusIndex = fOtherRuleStatusIndex; 1132 return true; 1133 } 1134 } 1135 1136 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1137 assert(false); 1138 fPositionInCache = -1; 1139 return false; 1140 }; 1141 preceding(int fromPos)1142 boolean preceding(int fromPos) { 1143 if (fromPos <= fStart || fromPos > fLimit) { 1144 fPositionInCache = -1; 1145 return false; 1146 } 1147 1148 if (fromPos == fLimit) { 1149 fPositionInCache = fBreaks.size() - 1; 1150 if (fPositionInCache >= 0) { 1151 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1152 } 1153 } 1154 1155 int r; 1156 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1157 --fPositionInCache; 1158 r = fBreaks.elementAt(fPositionInCache); 1159 assert(r < fromPos); 1160 fBoundary = r; 1161 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1162 return true; 1163 } 1164 1165 if (fPositionInCache == 0) { 1166 fPositionInCache = -1; 1167 return false; 1168 } 1169 1170 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1171 r = fBreaks.elementAt(fPositionInCache); 1172 if (r < fromPos) { 1173 fBoundary = r; 1174 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1175 return true; 1176 } 1177 } 1178 assert(false); 1179 fPositionInCache = -1; 1180 return false; 1181 }; 1182 1183 /** 1184 * Populate the cache with the dictionary based boundaries within a region of text. 1185 * @param startPos The start position of a range of text 1186 * @param endPos The end position of a range of text 1187 * @param firstRuleStatus The rule status index that applies to the break at startPos 1188 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1189 * @internal 1190 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1191 void populateDictionary(int startPos, int endPos, 1192 int firstRuleStatus, int otherRuleStatus) { 1193 if ((endPos - startPos) <= 1) { 1194 return; 1195 } 1196 1197 reset(); 1198 fFirstRuleStatusIndex = firstRuleStatus; 1199 fOtherRuleStatusIndex = otherRuleStatus; 1200 1201 int rangeStart = startPos; 1202 int rangeEnd = endPos; 1203 1204 int category; 1205 int current; 1206 int foundBreakCount = 0; 1207 1208 // Loop through the text, looking for ranges of dictionary characters. 1209 // For each span, find the appropriate break engine, and ask it to find 1210 // any breaks within the span. 1211 1212 fText.setIndex(rangeStart); 1213 int c = CharacterIteration.current32(fText); 1214 category = (short)fRData.fTrie.get(c); 1215 1216 while(true) { 1217 while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { 1218 c = CharacterIteration.next32(fText); // pre-increment 1219 category = (short)fRData.fTrie.get(c); 1220 } 1221 if (current >= rangeEnd) { 1222 break; 1223 } 1224 1225 // We now have a dictionary character. Get the appropriate language object 1226 // to deal with it. 1227 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1228 1229 // Ask the language object if there are any breaks. It will add them to the cache and 1230 // leave the text pointer on the other side of its range, ready to search for the next one. 1231 if (lbe != null) { 1232 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); 1233 } 1234 1235 // Reload the loop variables for the next go-round 1236 c = CharacterIteration.current32(fText); 1237 category = (short)fRData.fTrie.get(c); 1238 } 1239 1240 // If we found breaks, ensure that the first and last entries are 1241 // the original starting and ending position. And initialize the 1242 // cache iteration position to the first entry. 1243 1244 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1245 if (foundBreakCount > 0) { 1246 assert(foundBreakCount == fBreaks.size()); 1247 if (startPos < fBreaks.elementAt(0)) { 1248 // The dictionary did not place a boundary at the start of the segment of text. 1249 // Add one now. This should not commonly happen, but it would be easy for interactions 1250 // of the rules for dictionary segments and the break engine implementations to 1251 // inadvertently cause it. Cover it here, just in case. 1252 fBreaks.offer(startPos); 1253 } 1254 if (endPos > fBreaks.peek()) { 1255 fBreaks.push(endPos); 1256 } 1257 fPositionInCache = 0; 1258 // Note: Dictionary matching may extend beyond the original limit. 1259 fStart = fBreaks.elementAt(0); 1260 fLimit = fBreaks.peek(); 1261 } else { 1262 // there were no language-based breaks, even though the segment contained 1263 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1264 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1265 } 1266 1267 }; 1268 1269 DictionaryCache()1270 DictionaryCache() { 1271 fPositionInCache = -1; 1272 fBreaks = new DictionaryBreakEngine.DequeI(); 1273 } 1274 1275 /** 1276 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1277 * 1278 * @param src the source object to be copied. 1279 */ DictionaryCache(DictionaryCache src)1280 DictionaryCache(DictionaryCache src) { 1281 try { 1282 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1283 } 1284 catch (CloneNotSupportedException e) { 1285 throw new RuntimeException(e); 1286 } 1287 fPositionInCache = src.fPositionInCache; 1288 fStart = src.fStart; 1289 fLimit = src.fLimit; 1290 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1291 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1292 fBoundary = src.fBoundary; 1293 fStatusIndex = src.fStatusIndex; 1294 } 1295 1296 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1297 DictionaryBreakEngine.DequeI fBreaks; 1298 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1299 // // or preceding(). Optimizes sequential access. 1300 int fStart; // Text position of first boundary in cache. 1301 int fLimit; // Last boundary in cache. Which is the limit of the 1302 // // text segment being handled by the dictionary. 1303 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1304 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1305 int fBoundary; // Current boundary. Set by preceding(), following(). 1306 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1307 }; 1308 1309 1310 1311 1312 /* 1313 * class BreakCache 1314 * 1315 * Cache of break boundary positions and rule status values. 1316 * Break iterator API functions, next(), previous(), etc., will use cached results 1317 * when possible, and otherwise cache new results as they are obtained. 1318 * 1319 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1320 * 1321 * The cache is implemented as a single circular buffer. 1322 */ 1323 1324 /* 1325 * size of the circular cache buffer. 1326 */ 1327 1328 class BreakCache { 1329 BreakCache()1330 BreakCache() { 1331 reset(); 1332 }; 1333 reset(int pos, int ruleStatus)1334 void reset(int pos, int ruleStatus) { 1335 fStartBufIdx = 0; 1336 fEndBufIdx = 0; 1337 fTextIdx = pos; 1338 fBufIdx = 0; 1339 fBoundaries[0] = pos; 1340 fStatuses[0] = (short)ruleStatus; 1341 } 1342 reset()1343 void reset() {reset(0, 0); }; 1344 next()1345 void next() { 1346 if (fBufIdx == fEndBufIdx) { 1347 fDone = !populateFollowing(); 1348 fPosition = fTextIdx; 1349 fRuleStatusIndex = fStatuses[fBufIdx]; 1350 } else { 1351 fBufIdx = modChunkSize(fBufIdx + 1); 1352 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1353 fRuleStatusIndex = fStatuses[fBufIdx]; 1354 } 1355 }; 1356 previous()1357 void previous() { 1358 int initialBufIdx = fBufIdx; 1359 if (fBufIdx == fStartBufIdx) { 1360 // At start of cache. Prepend to it. 1361 populatePreceding(); 1362 } else { 1363 // Cache already holds the next boundary 1364 fBufIdx = modChunkSize(fBufIdx - 1); 1365 fTextIdx = fBoundaries[fBufIdx]; 1366 } 1367 fDone = (fBufIdx == initialBufIdx); 1368 fPosition = fTextIdx; 1369 fRuleStatusIndex = fStatuses[fBufIdx]; 1370 return; 1371 }; 1372 1373 // Move the iteration state to the position following the startPosition. 1374 // Input position must be pinned to the input length. following(int startPos)1375 void following(int startPos) { 1376 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1377 // startPos is in the cache. Do a next() from that position. 1378 // TODO: an awkward set of interactions with bi->fDone 1379 // seek() does not clear it; it can't because of interactions with populateNear(). 1380 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1381 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1382 fDone = false; 1383 next(); 1384 } 1385 1386 }; 1387 preceding(int startPos)1388 void preceding(int startPos) { 1389 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1390 if (startPos == fTextIdx) { 1391 previous(); 1392 } else { 1393 // seek() leaves the BreakCache positioned at the preceding boundary 1394 // if the requested position is between two bounaries. 1395 // current() pushes the BreakCache position out to the BreakIterator itself. 1396 assert(startPos > fTextIdx); 1397 current(); 1398 } 1399 } 1400 return; 1401 }; 1402 1403 /** 1404 * Update the state of the public BreakIterator (fBI) to reflect the 1405 * current state of the break iterator cache (this). 1406 */ current()1407 int current() { 1408 fPosition = fTextIdx; 1409 fRuleStatusIndex = fStatuses[fBufIdx]; 1410 fDone = false; 1411 return fTextIdx; 1412 }; 1413 1414 /** 1415 * Add boundaries to the cache near the specified position. 1416 * The given position need not be a boundary itself. 1417 * The input position must be within the range of the text, and 1418 * on a code point boundary. 1419 * If the requested position is a break boundary, leave the iteration 1420 * position on it. 1421 * If the requested position is not a boundary, leave the iteration 1422 * position on the preceding boundary and include both the the 1423 * preceding and following boundaries in the cache. 1424 * Additional boundaries, either preceding or following, may be added 1425 * to the cache as a side effect. 1426 * 1427 * Return false if the operation failed. 1428 */ populateNear(int position)1429 boolean populateNear(int position) { 1430 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1431 1432 // Find a boundary somewhere in the vicinity of the requested position. 1433 // Depending on the safe rules and the text data, it could be either before, at, or after 1434 // the requested position. 1435 1436 1437 // If the requested position is not near already cached positions, clear the existing cache, 1438 // find a near-by boundary and begin new cache contents there. 1439 1440 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1441 int aBoundary = fText.getBeginIndex(); 1442 int ruleStatusIndex = 0; 1443 if (position > aBoundary + 20) { 1444 int backupPos = handleSafePrevious(position); 1445 if (backupPos > aBoundary) { 1446 // Advance to the boundary following the backup position. 1447 // There is a complication: the safe reverse rules identify pairs of code points 1448 // that are safe. If advancing from the safe point moves forwards by less than 1449 // two code points, we need to advance one more time to ensure that the boundary 1450 // is good, including a correct rules status value. 1451 // 1452 fPosition = backupPos; 1453 aBoundary = handleNext(); 1454 if (aBoundary == backupPos + 1 || 1455 (aBoundary == backupPos + 2 && 1456 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1457 Character.isLowSurrogate(fText.next()))) { 1458 // The initial handleNext() only advanced by a single code point. Go again. 1459 // Safe rules identify safe pairs. 1460 aBoundary = handleNext(); 1461 } 1462 } 1463 ruleStatusIndex = fRuleStatusIndex; 1464 } 1465 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1466 } 1467 1468 // Fill in boundaries between existing cache content and the new requested position. 1469 1470 if (fBoundaries[fEndBufIdx] < position) { 1471 // The last position in the cache precedes the requested position. 1472 // Add following position(s) to the cache. 1473 while (fBoundaries[fEndBufIdx] < position) { 1474 if (!populateFollowing()) { 1475 assert false; 1476 return false; 1477 } 1478 } 1479 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1480 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1481 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1482 previous(); 1483 } 1484 return true; 1485 } 1486 1487 if (fBoundaries[fStartBufIdx] > position) { 1488 // The first position in the cache is beyond the requested position. 1489 // back up more until we get a boundary <= the requested position. 1490 while (fBoundaries[fStartBufIdx] > position) { 1491 populatePreceding(); 1492 } 1493 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1494 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1495 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1496 next(); 1497 } 1498 if (fTextIdx > position) { 1499 // If position is not itself a boundary, the next() loop above will overshoot. 1500 // Back up one, leaving cache position at the boundary preceding the requested position. 1501 previous(); 1502 } 1503 return true; 1504 } 1505 1506 assert fTextIdx == position; 1507 return true; 1508 1509 }; 1510 1511 /** 1512 * Add boundary(s) to the cache following the current last boundary. 1513 * Return false if at the end of the text, and no more boundaries can be added. 1514 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1515 */ populateFollowing()1516 boolean populateFollowing() { 1517 int fromPosition = fBoundaries[fEndBufIdx]; 1518 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1519 int pos = 0; 1520 int ruleStatusIdx = 0; 1521 1522 if (fDictionaryCache.following(fromPosition)) { 1523 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1524 return true; 1525 } 1526 1527 fPosition = fromPosition; 1528 pos = handleNext(); 1529 if (pos == BreakIterator.DONE) { 1530 return false; 1531 } 1532 1533 ruleStatusIdx = fRuleStatusIndex; 1534 if (fDictionaryCharCount > 0) { 1535 // The text segment obtained from the rules includes dictionary characters. 1536 // Subdivide it, with subdivided results going into the dictionary cache. 1537 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1538 if (fDictionaryCache.following(fromPosition)) { 1539 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1540 return true; 1541 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1542 // But be careful with interactions with populateNear(). 1543 } 1544 } 1545 1546 // Rule based segment did not include dictionary characters. 1547 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1548 // meaning that we didn't take the return, above. 1549 // Add its end point to the cache. 1550 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1551 1552 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1553 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1554 // 1555 for (int count=0; count<6; ++count) { 1556 pos = handleNext(); 1557 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1558 break; 1559 } 1560 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1561 } 1562 return true; 1563 }; 1564 1565 /** 1566 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1567 * Leave the iteration position on the first added boundary. 1568 * Return false if no boundaries could be added (if at the start of the text.) 1569 */ populatePreceding()1570 boolean populatePreceding() { 1571 int textBegin = fText.getBeginIndex(); 1572 int fromPosition = fBoundaries[fStartBufIdx]; 1573 if (fromPosition == textBegin) { 1574 return false; 1575 } 1576 1577 int position = textBegin; 1578 int positionStatusIdx = 0; 1579 1580 if (fDictionaryCache.preceding(fromPosition)) { 1581 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1582 return true; 1583 } 1584 1585 int backupPosition = fromPosition; 1586 1587 // Find a boundary somewhere preceding the first already-cached boundary 1588 do { 1589 backupPosition = backupPosition - 30; 1590 if (backupPosition <= textBegin) { 1591 backupPosition = textBegin; 1592 } else { 1593 backupPosition = handleSafePrevious(backupPosition); 1594 } 1595 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1596 position = textBegin; 1597 positionStatusIdx = 0; 1598 } else { 1599 // Advance to the boundary following the backup position. 1600 // There is a complication: the safe reverse rules identify pairs of code points 1601 // that are safe. If advancing from the safe point moves forwards by less than 1602 // two code points, we need to advance one more time to ensure that the boundary 1603 // is good, including a correct rules status value. 1604 // 1605 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1606 position = handleNext(); 1607 if (position == backupPosition + 1 || 1608 (position == backupPosition + 2 && 1609 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1610 Character.isLowSurrogate(fText.next()))) { 1611 // The initial handleNext() only advanced by a single code point. Go again. 1612 // Safe rules identify safe pairs. 1613 position = handleNext(); 1614 } 1615 positionStatusIdx = fRuleStatusIndex; 1616 } 1617 } while (position >= fromPosition); 1618 1619 // Find boundaries between the one we just located and the first already-cached boundary 1620 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1621 1622 fSideBuffer.removeAllElements(); 1623 fSideBuffer.push(position); 1624 fSideBuffer.push(positionStatusIdx); 1625 1626 do { 1627 int prevPosition = fPosition = position; 1628 int prevStatusIdx = positionStatusIdx; 1629 position = handleNext(); 1630 positionStatusIdx = fRuleStatusIndex; 1631 if (position == BreakIterator.DONE) { 1632 break; 1633 } 1634 1635 boolean segmentHandledByDictionary = false; 1636 if (fDictionaryCharCount != 0) { 1637 // Segment from the rules includes dictionary characters. 1638 // Subdivide it, with subdivided results going into the dictionary cache. 1639 int dictSegEndPosition = position; 1640 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1641 while (fDictionaryCache.following(prevPosition)) { 1642 position = fDictionaryCache.fBoundary; 1643 positionStatusIdx = fDictionaryCache.fStatusIndex; 1644 segmentHandledByDictionary = true; 1645 assert(position > prevPosition); 1646 if (position >= fromPosition) { 1647 break; 1648 } 1649 assert(position <= dictSegEndPosition); 1650 fSideBuffer.push(position); 1651 fSideBuffer.push(positionStatusIdx); 1652 prevPosition = position; 1653 } 1654 assert(position==dictSegEndPosition || position>=fromPosition); 1655 } 1656 1657 if (!segmentHandledByDictionary && position < fromPosition) { 1658 fSideBuffer.push(position); 1659 fSideBuffer.push(positionStatusIdx); 1660 } 1661 } while (position < fromPosition); 1662 1663 // Move boundaries from the side buffer to the main circular buffer. 1664 boolean success = false; 1665 if (!fSideBuffer.isEmpty()) { 1666 positionStatusIdx = fSideBuffer.pop(); 1667 position = fSideBuffer.pop(); 1668 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1669 success = true; 1670 } 1671 1672 while (!fSideBuffer.isEmpty()) { 1673 positionStatusIdx = fSideBuffer.pop(); 1674 position = fSideBuffer.pop(); 1675 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1676 // No space in circular buffer to hold a new preceding result while 1677 // also retaining the current cache (iteration) position. 1678 // Bailing out is safe; the cache will refill again if needed. 1679 break; 1680 } 1681 } 1682 return success; 1683 }; 1684 1685 1686 static final boolean RetainCachePosition = false; 1687 static final boolean UpdateCachePosition = true; 1688 1689 /** 1690 * Add the boundary following the current position. 1691 * The current position can be left as it was, or changed to the newly added boundary, 1692 * as specified by the update parameter. 1693 */ addFollowing(int position, int ruleStatusIdx, boolean update)1694 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1695 assert(position > fBoundaries[fEndBufIdx]); 1696 assert(ruleStatusIdx <= Short.MAX_VALUE); 1697 int nextIdx = modChunkSize(fEndBufIdx + 1); 1698 if (nextIdx == fStartBufIdx) { 1699 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1700 } 1701 fBoundaries[nextIdx] = position; 1702 fStatuses[nextIdx] = (short)ruleStatusIdx; 1703 fEndBufIdx = nextIdx; 1704 if (update == UpdateCachePosition) { 1705 // Set current position to the newly added boundary. 1706 fBufIdx = nextIdx; 1707 fTextIdx = position; 1708 } else { 1709 // Retaining the original cache position. 1710 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1711 // It's the responsibility of callers of this function to not add too many. 1712 assert(nextIdx != fBufIdx); 1713 } 1714 1715 }; 1716 1717 1718 /** 1719 * Add the boundary preceding the current position. 1720 * The current position can be left as it was, or changed to the newly added boundary, 1721 * as specified by the update parameter. 1722 */ addPreceding(int position, int ruleStatusIdx, boolean update)1723 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1724 assert(position < fBoundaries[fStartBufIdx]); 1725 assert(ruleStatusIdx <= Short.MAX_VALUE); 1726 int nextIdx = modChunkSize(fStartBufIdx - 1); 1727 if (nextIdx == fEndBufIdx) { 1728 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1729 // Failure. The insertion of the new boundary would claim the buffer position that is the 1730 // current iteration position. And we also want to retain the current iteration position. 1731 // (The buffer is already completely full of entries that precede the iteration position.) 1732 return false; 1733 } 1734 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1735 } 1736 fBoundaries[nextIdx] = position; 1737 fStatuses[nextIdx] = (short)ruleStatusIdx; 1738 fStartBufIdx = nextIdx; 1739 if (update == UpdateCachePosition) { 1740 fBufIdx = nextIdx; 1741 fTextIdx = position; 1742 } 1743 return true; 1744 }; 1745 1746 /** 1747 * Set the cache position to the specified position, or, if the position 1748 * falls between to cached boundaries, to the preceding boundary. 1749 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1750 * The startPosition must be on a code point boundary. 1751 * 1752 * Return true if successful, false if the specified position is after 1753 * the last cached boundary or before the first. 1754 */ 1755 boolean seek(int pos) { 1756 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1757 return false; 1758 } 1759 if (pos == fBoundaries[fStartBufIdx]) { 1760 // Common case: seek(0), from BreakIterator::first() 1761 fBufIdx = fStartBufIdx; 1762 fTextIdx = fBoundaries[fBufIdx]; 1763 return true; 1764 } 1765 if (pos == fBoundaries[fEndBufIdx]) { 1766 fBufIdx = fEndBufIdx; 1767 fTextIdx = fBoundaries[fBufIdx]; 1768 return true; 1769 } 1770 1771 int min = fStartBufIdx; 1772 int max = fEndBufIdx; 1773 while (min != max) { 1774 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1775 probe = modChunkSize(probe); 1776 if (fBoundaries[probe] > pos) { 1777 max = probe; 1778 } else { 1779 min = modChunkSize(probe + 1); 1780 } 1781 } 1782 assert(fBoundaries[max] > pos); 1783 fBufIdx = modChunkSize(max - 1); 1784 fTextIdx = fBoundaries[fBufIdx]; 1785 assert(fTextIdx <= pos); 1786 return true; 1787 1788 }; 1789 1790 1791 /** 1792 * copy constructor, used from RuleBasedBreakIterator.clone(). 1793 * 1794 * @param src 1795 */ BreakCache(BreakCache src)1796 BreakCache(BreakCache src) { 1797 fStartBufIdx = src.fStartBufIdx; 1798 fEndBufIdx = src.fEndBufIdx; 1799 fTextIdx = src.fTextIdx; 1800 fBufIdx = src.fBufIdx; 1801 fBoundaries = src.fBoundaries.clone(); 1802 fStatuses = src.fStatuses.clone(); 1803 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1804 } 1805 dumpCache()1806 void dumpCache() { 1807 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1808 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1809 System.out.printf("%d %d%n", i, fBoundaries[i]); 1810 if (i == fEndBufIdx) { 1811 break; 1812 } 1813 } 1814 }; 1815 modChunkSize(int index)1816 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1817 1818 static final int CACHE_SIZE = 128; 1819 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1820 1821 int fStartBufIdx; 1822 int fEndBufIdx; // inclusive 1823 1824 int fTextIdx; 1825 int fBufIdx; 1826 1827 int[] fBoundaries = new int[CACHE_SIZE]; 1828 short[] fStatuses = new short[CACHE_SIZE]; 1829 1830 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1831 }; 1832 1833 1834 1835 1836 } 1837 1838