1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2005-2016 International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package ohos.global.icu.text; 12 13 import static ohos.global.icu.impl.CharacterIteration.DONE32; 14 import static ohos.global.icu.impl.CharacterIteration.next32; 15 import static ohos.global.icu.impl.CharacterIteration.nextTrail32; 16 17 import java.io.ByteArrayOutputStream; 18 import java.io.IOException; 19 import java.io.InputStream; 20 import java.io.OutputStream; 21 import java.nio.ByteBuffer; 22 import java.text.CharacterIterator; 23 import java.util.ArrayList; 24 import java.util.List; 25 26 import ohos.global.icu.impl.CharacterIteration; 27 import ohos.global.icu.impl.ICUBinary; 28 import ohos.global.icu.impl.ICUDebug; 29 import ohos.global.icu.impl.RBBIDataWrapper; 30 import ohos.global.icu.impl.Trie2; 31 import ohos.global.icu.lang.UCharacter; 32 import ohos.global.icu.lang.UProperty; 33 import ohos.global.icu.lang.UScript; 34 35 /** 36 * Rule Based Break Iterator 37 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 38 * 39 * @hide exposed on OHOS 40 */ 41 public class RuleBasedBreakIterator extends BreakIterator { 42 //======================================================================= 43 // Constructors & Factories 44 //======================================================================= 45 46 /** 47 * private constructor 48 */ RuleBasedBreakIterator()49 private RuleBasedBreakIterator() { 50 fDictionaryCharCount = 0; 51 synchronized(gAllBreakEngines) { 52 fBreakEngines = new ArrayList<>(gAllBreakEngines); 53 } 54 } 55 56 /** 57 * Create a break iterator from a precompiled set of break rules. 58 * 59 * Creating a break iterator from the binary rules is much faster than 60 * creating one from source rules. 61 * 62 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 63 * Binary break iterator rules are not guaranteed to be compatible between 64 * different versions of ICU. 65 * 66 * @param is an input stream supplying the compiled binary rules. 67 * @throws IOException if there is an error while reading the rules from the InputStream. 68 * @see #compileRules(String, OutputStream) 69 */ getInstanceFromCompiledRules(InputStream is)70 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 71 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 72 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 73 return This; 74 } 75 76 /** 77 * Create a break iterator from a precompiled set of break rules. 78 * 79 * Creating a break iterator from the binary rules is much faster than 80 * creating one from source rules. 81 * 82 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 83 * Binary break iterator rules are not guaranteed to be compatible between 84 * different versions of ICU. 85 * 86 * @param bytes a buffer supplying the compiled binary rules. 87 * @throws IOException if there is an error while reading the rules from the buffer. 88 * @see #compileRules(String, OutputStream) 89 * @deprecated This API is ICU internal only. 90 * @hide draft / provisional / internal are hidden on OHOS 91 */ 92 @Deprecated getInstanceFromCompiledRules(ByteBuffer bytes)93 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 94 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 95 This.fRData = RBBIDataWrapper.get(bytes); 96 return This; 97 } 98 99 /** 100 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 101 * @param rules The break rules to be used. 102 */ RuleBasedBreakIterator(String rules)103 public RuleBasedBreakIterator(String rules) { 104 this(); 105 try { 106 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 107 compileRules(rules, ruleOS); 108 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 109 } catch (IOException e) { 110 ///CLOVER:OFF 111 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 112 // causing bogus compiled rules to be produced, but with no compile error raised. 113 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 114 + e.getMessage()); 115 throw rte; 116 ///CLOVER:ON 117 } 118 } 119 120 //======================================================================= 121 // Boilerplate 122 //======================================================================= 123 124 /** 125 * Clones this iterator. 126 * @return A newly-constructed RuleBasedBreakIterator with the same 127 * behavior as this one. 128 */ 129 @Override clone()130 public Object clone() { 131 RuleBasedBreakIterator result; 132 result = (RuleBasedBreakIterator)super.clone(); 133 if (fText != null) { 134 result.fText = (CharacterIterator)(fText.clone()); 135 } 136 synchronized (gAllBreakEngines) { 137 result.fBreakEngines = new ArrayList<>(gAllBreakEngines); 138 } 139 result.fLookAheadMatches = new LookAheadResults(); 140 result.fBreakCache = result.new BreakCache(fBreakCache); 141 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 142 return result; 143 } 144 145 146 /** 147 * Returns true if both BreakIterators are of the same class, have the same 148 * rules, and iterate over the same text. 149 */ 150 @Override equals(Object that)151 public boolean equals(Object that) { 152 if (that == null) { 153 return false; 154 } 155 if (this == that) { 156 return true; 157 } 158 try { 159 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 160 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 161 return false; 162 } 163 if (fRData != null && other.fRData != null && 164 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 165 return false; 166 } 167 if (fText == null && other.fText == null) { 168 return true; 169 } 170 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 171 return false; 172 } 173 return fPosition == other.fPosition; 174 } 175 catch(ClassCastException e) { 176 return false; 177 } 178 } 179 180 /** 181 * Returns the description (rules) used to create this iterator. 182 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 183 */ 184 @Override toString()185 public String toString() { 186 String retStr = ""; 187 if (fRData != null) { 188 retStr = fRData.fRuleSource; 189 } 190 return retStr; 191 } 192 193 /** 194 * Compute a hashcode for this BreakIterator 195 * @return A hash code 196 */ 197 @Override hashCode()198 public int hashCode() 199 { 200 return fRData.fRuleSource.hashCode(); 201 } 202 203 204 private static final int START_STATE = 1; // The state number of the starting state 205 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 206 207 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 208 // of user text. A variable with this enum type keeps track of where we 209 // are. The state machine only fetches user text input while in RUN mode. 210 private static final int RBBI_START = 0; 211 private static final int RBBI_RUN = 1; 212 private static final int RBBI_END = 2; 213 214 /** 215 * The character iterator through which this BreakIterator accesses the text. 216 */ 217 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 218 219 /** 220 * The rule data for this BreakIterator instance. 221 * Not intended for public use. Declared public for testing purposes only. 222 * @deprecated This API is ICU internal only. 223 * @hide draft / provisional / internal are hidden on OHOS 224 */ 225 @Deprecated 226 public RBBIDataWrapper fRData; 227 228 /** 229 * The iteration state - current position, rule status for the current position, 230 * and whether the iterator ran off the end, yielding UBRK_DONE. 231 * Current position is pinned to be 0 < position <= text.length. 232 * Current position is always set to a boundary. 233 * 234 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 235 * Never has the value UBRK_DONE (-1). 236 */ 237 private int fPosition; 238 239 /** 240 * Index of the Rule {tag} values for the most recent match. 241 */ 242 private int fRuleStatusIndex; 243 244 /** 245 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 246 */ 247 private boolean fDone; 248 249 /** 250 * Cache of previously determined boundary positions. 251 */ 252 private BreakCache fBreakCache = new BreakCache(); 253 254 255 /** 256 * Counter for the number of characters encountered with the "dictionary" 257 * flag set. Normal RBBI iterators don't use it, although the code 258 * for updating it is live. Dictionary Based break iterators (a subclass 259 * of us) access this field directly. 260 * @hide draft / provisional / internal are hidden on OHOS 261 */ 262 private int fDictionaryCharCount; 263 264 private DictionaryCache fDictionaryCache = new DictionaryCache(); 265 266 /** 267 * ICU debug argument name for RBBI 268 */ 269 private static final String RBBI_DEBUG_ARG = "rbbi"; 270 271 /** 272 * Debugging flag. Trace operation of state machine when true. 273 */ 274 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 275 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 276 277 /** 278 * The "default" break engine - just skips over ranges of dictionary words, 279 * producing no breaks. Should only be used if characters need to be handled 280 * by a dictionary but we have no dictionary implementation for them. 281 * 282 * Only one instance; shared by all break iterators. 283 */ 284 private static final UnhandledBreakEngine gUnhandledBreakEngine; 285 286 /** 287 * List of all known break engines, common for all break iterators. 288 * Lazily updated as break engines are needed, because instantiation of 289 * break engines is expensive. 290 * 291 * Because gAllBreakEngines can be referenced concurrently from different 292 * BreakIterator instances, all access is synchronized. 293 */ 294 private static final List<LanguageBreakEngine> gAllBreakEngines; 295 296 static { 297 gUnhandledBreakEngine = new UnhandledBreakEngine(); 298 gAllBreakEngines = new ArrayList<>(); 299 gAllBreakEngines.add(gUnhandledBreakEngine); 300 } 301 302 /** 303 * List of all known break engines. Similar to gAllBreakEngines, but local to a 304 * break iterator, allowing it to be used without synchronization. 305 */ 306 private List<LanguageBreakEngine> fBreakEngines; 307 308 /** 309 * Dump the contents of the state table and character classes for this break iterator. 310 * For debugging only. 311 * @deprecated This API is ICU internal only. 312 * @hide draft / provisional / internal are hidden on OHOS 313 */ 314 @Deprecated dump(java.io.PrintStream out)315 public void dump(java.io.PrintStream out) { 316 if (out == null) { 317 out = System.out; 318 } 319 this.fRData.dump(out); 320 } 321 322 /** 323 * Compile a set of source break rules into the binary state tables used 324 * by the break iterator engine. Creating a break iterator from precompiled 325 * rules is much faster than creating one from source rules. 326 * 327 * Binary break rules are not guaranteed to be compatible between different 328 * versions of ICU. 329 * 330 * 331 * @param rules The source form of the break rules 332 * @param ruleBinary An output stream to receive the compiled rules. 333 * @throws IOException If there is an error writing the output. 334 * @see #getInstanceFromCompiledRules(InputStream) 335 */ compileRules(String rules, OutputStream ruleBinary)336 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 337 RBBIRuleBuilder.compileRules(rules, ruleBinary); 338 } 339 340 //======================================================================= 341 // BreakIterator overrides 342 //======================================================================= 343 344 /** 345 * Sets the current iteration position to the beginning of the text. 346 * (i.e., the CharacterIterator's starting offset). 347 * @return The offset of the beginning of the text. 348 */ 349 @Override first()350 public int first() { 351 if (fText == null) { 352 return BreakIterator.DONE; 353 } 354 fText.first(); 355 int start = fText.getIndex(); 356 if (!fBreakCache.seek(start)) { 357 fBreakCache.populateNear(start); 358 } 359 fBreakCache.current(); 360 assert(fPosition == start); 361 return fPosition; 362 } 363 364 /** 365 * Sets the current iteration position to the end of the text. 366 * (i.e., the CharacterIterator's ending offset). 367 * @return The text's past-the-end offset. 368 */ 369 @Override last()370 public int last() { 371 if (fText == null) { 372 return BreakIterator.DONE; 373 } 374 int endPos = fText.getEndIndex(); 375 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 376 assert(endShouldBeBoundary); 377 if (fPosition != endPos) { 378 assert(fPosition == endPos); 379 } 380 return endPos; 381 } 382 383 /** 384 * Advances the iterator either forward or backward the specified number of steps. 385 * Negative values move backward, and positive values move forward. This is 386 * equivalent to repeatedly calling next() or previous(). 387 * @param n The number of steps to move. The sign indicates the direction 388 * (negative is backwards, and positive is forwards). 389 * @return The character offset of the boundary position n boundaries away from 390 * the current one. 391 */ 392 @Override next(int n)393 public int next(int n) { 394 int result = 0; 395 if (n > 0) { 396 for (; n > 0 && result != DONE; --n) { 397 result = next(); 398 } 399 } else if (n < 0) { 400 for (; n < 0 && result != DONE; ++n) { 401 result = previous(); 402 } 403 } else { 404 result = current(); 405 } 406 return result; 407 } 408 409 /** 410 * Advances the iterator to the next boundary position. 411 * @return The position of the first boundary after this one. 412 */ 413 @Override next()414 public int next() { 415 fBreakCache.next(); 416 return fDone ? DONE : fPosition; 417 } 418 419 /** 420 * Moves the iterator backwards, to the boundary preceding the current one. 421 * @return The position of the boundary position immediately preceding the starting position. 422 */ 423 @Override previous()424 public int previous() { 425 fBreakCache.previous(); 426 return fDone ? DONE : fPosition; 427 } 428 429 /** 430 * Sets the iterator to refer to the first boundary position following 431 * the specified position. 432 * @param startPos The position from which to begin searching for a break position. 433 * @return The position of the first break after the current position. 434 */ 435 @Override following(int startPos)436 public int following(int startPos) { 437 // if the supplied position is before the beginning, return the 438 // text's starting offset 439 if (startPos < fText.getBeginIndex()) { 440 return first(); 441 } 442 443 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 444 // Or it may be beyond the end of the text. 445 startPos = CISetIndex32(fText, startPos); 446 fBreakCache.following(startPos); 447 return fDone ? DONE : fPosition; 448 } 449 450 451 /** 452 * Sets the iterator to refer to the last boundary position before the 453 * specified position. 454 * @param offset The position to begin searching for a break from. 455 * @return The position of the last boundary before the starting position. 456 */ 457 @Override preceding(int offset)458 public int preceding(int offset) { 459 if (fText == null || offset > fText.getEndIndex()) { 460 return last(); 461 } else if (offset < fText.getBeginIndex()) { 462 return first(); 463 } 464 465 // Move requested offset to a code point start. It might be between a lead and trail surrogate. 466 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 467 int adjustedOffset = offset; 468 fBreakCache.preceding(adjustedOffset); 469 return fDone ? DONE : fPosition; 470 471 } 472 473 474 /** 475 * Throw IllegalArgumentException unless begin <= offset < end. 476 */ checkOffset(int offset, CharacterIterator text)477 protected static final void checkOffset(int offset, CharacterIterator text) { 478 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 479 throw new IllegalArgumentException("offset out of bounds"); 480 } 481 } 482 483 484 /** 485 * Returns true if the specified position is a boundary position. As a side 486 * effect, leaves the iterator pointing to the first boundary position at 487 * or after "offset". 488 * @param offset the offset to check. 489 * @return True if "offset" is a boundary position. 490 */ 491 @Override isBoundary(int offset)492 public boolean isBoundary(int offset) { 493 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 494 // to not be boundaries, and to not be errors. 495 checkOffset(offset, fText); 496 497 // Adjust offset to be on a code point boundary and not beyond the end of the text. 498 // Note that isBoundary() is always false for offsets that are not on code point boundaries. 499 // But we still need the side effect of leaving iteration at the following boundary. 500 int adjustedOffset = CISetIndex32(fText, offset); 501 502 boolean result = false; 503 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 504 result = (fBreakCache.current() == offset); 505 } 506 507 if (!result) { 508 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 509 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 510 next(); 511 } 512 return result; 513 514 } 515 516 /** 517 * Returns the current iteration position. Note that DONE is never 518 * returned from this function; if iteration has run to the end of a 519 * string, current() will return the length of the string while 520 * next() will return BreakIterator.DONE). 521 * @return The current iteration position. 522 */ 523 @Override current()524 public int current() { 525 return (fText != null) ? fPosition : BreakIterator.DONE; 526 } 527 528 529 /** 530 * Return the status tag from the break rule that determined the boundary at 531 * the current iteration position. The values appear in the rule source 532 * within brackets, {123}, for example. For rules that do not specify a 533 * status, a default value of 0 is returned. If more than one rule applies, 534 * the numerically largest of the possible status values is returned. 535 * <p> 536 * Of the standard types of ICU break iterators, only the word and line break 537 * iterator provides status values. The values are defined in 538 * class RuleBasedBreakIterator, and allow distinguishing between words 539 * that contain alphabetic letters, "words" that appear to be numbers, 540 * punctuation and spaces, words containing ideographic characters, and 541 * more. Call <code>getRuleStatus</code> after obtaining a boundary 542 * position from <code>next()</code>, <code>previous()</code>, or 543 * any other break iterator functions that returns a boundary position. 544 * <p> 545 * Note that <code>getRuleStatus()</code> returns the value corresponding to 546 * <code>current()</code> index even after <code>next()</code> has returned DONE. 547 * <p> 548 549 * @return the status from the break rule that determined the boundary 550 * at the current iteration position. 551 */ 552 553 @Override getRuleStatus()554 public int getRuleStatus() { 555 // Status records have this form: 556 // Count N <-- fLastRuleStatusIndex points here. 557 // Status val 0 558 // Status val 1 559 // ... 560 // Status val N-1 <-- the value we need to return 561 // The status values are sorted in ascending order. 562 // This function returns the last (largest) of the array of status values. 563 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 564 int tagVal = fRData.fStatusTable[idx]; 565 return tagVal; 566 } 567 568 /** 569 * Get the status (tag) values from the break rule(s) that determined the boundary 570 * at the current iteration position. The values appear in the rule source 571 * within brackets, {123}, for example. The default status value for rules 572 * that do not explicitly provide one is zero. 573 * <p> 574 * The status values used by the standard ICU break rules are defined 575 * as public constants in class RuleBasedBreakIterator. 576 * <p> 577 * If the size of the output array is insufficient to hold the data, 578 * the output will be truncated to the available length. No exception 579 * will be thrown. 580 * 581 * @param fillInArray an array to be filled in with the status values. 582 * @return The number of rule status values from the rules that determined 583 * the boundary at the current iteration position. 584 * In the event that the array is too small, the return value 585 * is the total number of status values that were available, 586 * not the reduced number that were actually returned. 587 */ 588 @Override getRuleStatusVec(int[] fillInArray)589 public int getRuleStatusVec(int[] fillInArray) { 590 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 591 if (fillInArray != null) { 592 int numToCopy = Math.min(numStatusVals, fillInArray.length); 593 for (int i=0; i<numToCopy; i++) { 594 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 595 } 596 } 597 return numStatusVals; 598 } 599 600 /** 601 * Returns a CharacterIterator over the text being analyzed. 602 * <p> 603 * <b><i>Caution:</i></b>The state of the returned CharacterIterator 604 * must not be modified in any way while the BreakIterator is still in use. 605 * Doing so will lead to undefined behavior of the BreakIterator. 606 * Clone the returned CharacterIterator first and work with that. 607 * <p> 608 * The returned CharacterIterator is a reference 609 * to the <b>actual iterator being used</b> by the BreakIterator. 610 * No guarantees are made about the current position 611 * of this iterator when it is returned; it may differ from the 612 * BreakIterators current position. If you need to move that 613 * position to examine the text, clone this function's return value first. 614 * @return An iterator over the text being analyzed. 615 */ 616 @Override getText()617 public CharacterIterator getText() { 618 return fText; 619 } 620 621 /** 622 * Set the iterator to analyze a new piece of text. This function resets 623 * the current iteration position to the beginning of the text. 624 * (The old iterator is dropped.) 625 * <p> 626 * <b><i>Caution:</i></b> The supplied CharacterIterator is used 627 * directly by the BreakIterator, and must not be altered in any 628 * way by code outside of the BreakIterator. 629 * Doing so will lead to undefined behavior of the BreakIterator. 630 * 631 * @param newText An iterator over the text to analyze. 632 */ 633 @Override setText(CharacterIterator newText)634 public void setText(CharacterIterator newText) { 635 if (newText != null) { 636 fBreakCache.reset(newText.getBeginIndex(), 0); 637 } else { 638 fBreakCache.reset(); 639 } 640 fDictionaryCache.reset(); 641 fText = newText; 642 this.first(); 643 } 644 645 /** 646 * Control debug, trace and dump options. 647 * @deprecated This API is ICU internal only. 648 * @hide draft / provisional / internal are hidden on OHOS 649 */ 650 @Deprecated 651 public static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 652 ICUDebug.value(RBBI_DEBUG_ARG) : null; 653 654 getLanguageBreakEngine(int c)655 private LanguageBreakEngine getLanguageBreakEngine(int c) { 656 657 // We have a dictionary character. 658 // Does an already instantiated break engine handle it? 659 for (LanguageBreakEngine candidate : fBreakEngines) { 660 if (candidate.handles(c)) { 661 return candidate; 662 } 663 } 664 665 synchronized (gAllBreakEngines) { 666 // This break iterator's list of break engines didn't handle the character. 667 // Check the global list, another break iterator may have instantiated the 668 // desired engine. 669 for (LanguageBreakEngine candidate : gAllBreakEngines) { 670 if (candidate.handles(c)) { 671 fBreakEngines.add(candidate); 672 return candidate; 673 } 674 } 675 676 // The global list doesn't have an existing engine, build one. 677 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 678 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 679 // Katakana, Hiragana and Han are handled by the same dictionary engine. 680 // Fold them together for mapping from script -> engine. 681 script = UScript.HAN; 682 } 683 684 LanguageBreakEngine eng; 685 try { 686 switch (script) { 687 case UScript.THAI: 688 eng = new ThaiBreakEngine(); 689 break; 690 case UScript.LAO: 691 eng = new LaoBreakEngine(); 692 break; 693 case UScript.MYANMAR: 694 eng = new BurmeseBreakEngine(); 695 break; 696 case UScript.KHMER: 697 eng = new KhmerBreakEngine(); 698 break; 699 case UScript.HAN: 700 eng = new CjkBreakEngine(false); 701 break; 702 case UScript.HANGUL: 703 eng = new CjkBreakEngine(true); 704 break; 705 default: 706 gUnhandledBreakEngine.handleChar(c); 707 eng = gUnhandledBreakEngine; 708 break; 709 } 710 } catch (IOException e) { 711 eng = null; 712 } 713 714 if (eng != null && eng != gUnhandledBreakEngine) { 715 gAllBreakEngines.add(eng); 716 fBreakEngines.add(eng); 717 } 718 return eng; 719 } // end synchronized(gAllBreakEngines) 720 } 721 722 private static final int kMaxLookaheads = 8; 723 private static class LookAheadResults { 724 int fUsedSlotLimit; 725 int[] fPositions; 726 int[] fKeys; 727 LookAheadResults()728 LookAheadResults() { 729 fUsedSlotLimit= 0; 730 fPositions = new int[kMaxLookaheads]; 731 fKeys = new int[kMaxLookaheads]; 732 } 733 getPosition(int key)734 int getPosition(int key) { 735 for (int i=0; i<fUsedSlotLimit; ++i) { 736 if (fKeys[i] == key) { 737 return fPositions[i]; 738 } 739 } 740 assert(false); 741 return -1; 742 } 743 setPosition(int key, int position)744 void setPosition(int key, int position) { 745 int i; 746 for (i=0; i<fUsedSlotLimit; ++i) { 747 if (fKeys[i] == key) { 748 fPositions[i] = position; 749 return; 750 } 751 } 752 if (i >= kMaxLookaheads) { 753 assert(false); 754 i = kMaxLookaheads - 1; 755 } 756 fKeys[i] = key; 757 fPositions[i] = position; 758 assert(fUsedSlotLimit == i); 759 fUsedSlotLimit = i + 1; 760 } 761 reset()762 void reset() { 763 fUsedSlotLimit = 0; 764 } 765 }; 766 private LookAheadResults fLookAheadMatches = new LookAheadResults(); 767 768 769 /** 770 * The State Machine Engine for moving forward is here. 771 * This function is the heart of the RBBI run time engine. 772 * 773 * Input 774 * fPosition, the position in the text to begin from. 775 * Output 776 * fPosition: the boundary following the starting position. 777 * fDictionaryCharCount the number of dictionary characters encountered. 778 * If > 0, the segment will be further subdivided 779 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 780 * 781 * @return the new iterator position 782 * 783 * A note on supplementary characters and the position of underlying 784 * Java CharacterIterator: Normally, a character iterator is positioned at 785 * the char most recently returned by next(). Within this function, when 786 * a supplementary char is being processed, the char iterator is left 787 * sitting on the trail surrogate, in the middle of the code point. 788 * This is different from everywhere else, where an iterator always 789 * points at the lead surrogate of a supplementary. 790 */ handleNext()791 private int handleNext() { 792 if (TRACE) { 793 System.out.println("Handle Next pos char state category"); 794 } 795 796 // handleNext always sets the break tag value. 797 // Set the default for it. 798 fRuleStatusIndex = 0; 799 fDictionaryCharCount = 0; 800 801 // caches for quicker access 802 CharacterIterator text = fText; 803 Trie2 trie = fRData.fTrie; 804 805 short[] stateTable = fRData.fFTable.fTable; 806 int initialPosition = fPosition; 807 text.setIndex(initialPosition); 808 int result = initialPosition; 809 810 // Set up the starting char 811 int c = text.current(); 812 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 813 c = nextTrail32(text, c); 814 if (c == DONE32) { 815 fDone = true; 816 return BreakIterator.DONE; 817 } 818 } 819 820 // Set the initial state for the state machine 821 int state = START_STATE; 822 int row = fRData.getRowIndex(state); 823 short category = 3; 824 int flagsState = fRData.fFTable.fFlags; 825 int mode = RBBI_RUN; 826 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 827 category = 2; 828 mode = RBBI_START; 829 if (TRACE) { 830 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 831 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 832 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 833 } 834 } 835 fLookAheadMatches.reset(); 836 837 // loop until we reach the end of the text or transition to state 0 838 while (state != STOP_STATE) { 839 if (c == DONE32) { 840 // Reached end of input string. 841 if (mode == RBBI_END) { 842 // We have already run the loop one last time with the 843 // character set to the pseudo {eof} value. Now it is time 844 // to unconditionally bail out. 845 break; 846 } 847 // Run the loop one last time with the fake end-of-input character category 848 mode = RBBI_END; 849 category = 1; 850 } 851 else if (mode == RBBI_RUN) { 852 // Get the char category. An incoming category of 1 or 2 mens that 853 // we are preset for doing the beginning or end of input, and 854 // that we shouldn't get a category from an actual text input character. 855 // 856 857 // look up the current character's character category, which tells us 858 // which column in the state table to look at. 859 // 860 category = (short) trie.get(c); 861 862 // Check the dictionary bit in the character's category. 863 // Counter is only used by dictionary based iterators (subclasses). 864 // Chars that need to be handled by a dictionary have a flag bit set 865 // in their category values. 866 // 867 if ((category & 0x4000) != 0) { 868 fDictionaryCharCount++; 869 // And off the dictionary flag bit. 870 category &= ~0x4000; 871 } 872 873 if (TRACE) { 874 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 875 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 876 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 877 } 878 879 // Advance to the next character. 880 // If this is a beginning-of-input loop iteration, don't advance. 881 // The next iteration will be processing the first real input character. 882 c = text.next(); 883 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 884 c = nextTrail32(text, c); 885 } 886 } 887 else { 888 mode = RBBI_RUN; 889 } 890 891 // look up a state transition in the state table 892 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 893 row = fRData.getRowIndex(state); 894 895 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 896 // Match found, common case 897 result = text.getIndex(); 898 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 899 // The iterator has been left in the middle of a surrogate pair. 900 // We want the start of it. 901 result--; 902 } 903 904 // Remember the break status (tag) values. 905 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 906 } 907 908 int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 909 if (completedRule > 0) { 910 // Lookahead match is completed 911 int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 912 if (lookaheadResult >= 0) { 913 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 914 fPosition = lookaheadResult; 915 return lookaheadResult; 916 } 917 } 918 919 // If we are at the position of the '/' in a look-ahead (hard break) rule; 920 // record the current position, to be returned later, if the full rule matches. 921 // TODO: Move this check before the previous check of fAccepting. 922 // This would enable hard-break rules with no following context. 923 // But there are line break test failures when trying this. Investigate. 924 // Issue ICU-20837 925 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 926 if (rule != 0) { 927 int pos = text.getIndex(); 928 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 929 // The iterator has been left in the middle of a surrogate pair. 930 // We want the beginning of it. 931 pos--; 932 } 933 fLookAheadMatches.setPosition(rule, pos); 934 } 935 936 937 } // End of state machine main loop 938 939 // The state machine is done. Check whether it found a match... 940 941 // If the iterator failed to advance in the match engine force it ahead by one. 942 // This indicates a defect in the break rules, which should always match 943 // at least one character. 944 945 if (result == initialPosition) { 946 if (TRACE) { 947 System.out.println("Iterator did not move. Advancing by 1."); 948 } 949 text.setIndex(initialPosition); 950 next32(text); 951 result = text.getIndex(); 952 fRuleStatusIndex = 0; 953 } 954 955 // Leave the iterator at our result position. 956 // (we may have advanced beyond the last accepting position chasing after 957 // longer matches that never completed.) 958 fPosition = result; 959 960 if (TRACE) { 961 System.out.println("result = " + result); 962 } 963 return result; 964 } 965 966 /** 967 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 968 * This locates a "Safe Position" from which the forward break rules 969 * will operate correctly. A Safe Position is not necessarily a boundary itself. 970 * 971 * The logic of this function is very similar to handleNext(), above, but simpler 972 * because the safe table does not require as many options. 973 * 974 * @param fromPosition the position in the input text to begin the iteration. 975 * @hide draft / provisional / internal are hidden on OHOS 976 */ handleSafePrevious(int fromPosition)977 private int handleSafePrevious(int fromPosition) { 978 int state; 979 short category = 0; 980 int result = 0; 981 982 // caches for quicker access 983 CharacterIterator text = fText; 984 Trie2 trie = fRData.fTrie; 985 short[] stateTable = fRData.fRTable.fTable; 986 987 CISetIndex32(text, fromPosition); 988 if (TRACE) { 989 System.out.print("Handle Previous pos char state category"); 990 } 991 992 // if we're already at the start of the text, return DONE. 993 if (text.getIndex() == text.getBeginIndex()) { 994 return BreakIterator.DONE; 995 } 996 997 // Set the initial state for the state machine 998 int c = CharacterIteration.previous32(text); 999 state = START_STATE; 1000 int row = fRData.getRowIndex(state); 1001 1002 // loop until we reach the start of the text or transition to state 0 1003 // 1004 for (; c != DONE32; c = CharacterIteration.previous32(text)) { 1005 1006 // look up the current character's character category, which tells us 1007 // which column in the state table to look at. 1008 // 1009 // And off the dictionary flag bit. For reverse iteration it is not used. 1010 category = (short) trie.get(c); 1011 category &= ~0x4000; 1012 if (TRACE) { 1013 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 1014 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 1015 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 1016 } 1017 1018 // State Transition - move machine to its next state 1019 // 1020 assert(category < fRData.fHeader.fCatCount); 1021 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1022 row = fRData.getRowIndex(state); 1023 1024 if (state == STOP_STATE) { 1025 // This is the normal exit from the lookup state machine. 1026 // Transition to state zero means we have found a safe point. 1027 break; 1028 } 1029 } 1030 1031 // The state machine is done. 1032 result = text.getIndex(); 1033 if (TRACE) { 1034 System.out.println("result = " + result); 1035 } 1036 return result; 1037 } 1038 1039 /** 1040 * Set the index of a CharacterIterator. 1041 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1042 * If the index points to a trail surrogate of a supplementary character, adjust it 1043 * to the start (lead surrogate) index. 1044 * 1045 * @param ci A CharacterIterator to set 1046 * @param index the index to set 1047 * @return the resulting index, possibly pinned or adjusted. 1048 */ 1049 private static int CISetIndex32(CharacterIterator ci, int index) { 1050 if (index <= ci.getBeginIndex()) { 1051 ci.first(); 1052 } else if (index >= ci.getEndIndex()) { 1053 ci.setIndex(ci.getEndIndex()); 1054 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1055 if (!Character.isHighSurrogate(ci.previous())) { 1056 ci.next(); 1057 } 1058 } 1059 return ci.getIndex(); 1060 } 1061 1062 /** DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1063 * Dictionary boundaries are moved first to this cache, then from here 1064 * to the main BreakCache, where they may inter-leave with non-dictionary 1065 * boundaries. The public BreakIterator API always fetches directly 1066 * from the main BreakCache, not from here. 1067 * 1068 * In common situations, the number of boundaries in a single dictionary run 1069 * should be quite small, it will be terminated by punctuation, spaces, 1070 * or any other non-dictionary characters. The main BreakCache may end 1071 * up with boundaries from multiple dictionary based runs. 1072 * 1073 * The boundaries are stored in a simple ArrayList (vector), with the 1074 * assumption that they will be accessed sequentially. 1075 */ 1076 class DictionaryCache { 1077 1078 void reset() { 1079 fPositionInCache = -1; 1080 fStart = 0; 1081 fLimit = 0; 1082 fFirstRuleStatusIndex = 0; 1083 fOtherRuleStatusIndex = 0; 1084 fBreaks.removeAllElements(); 1085 }; 1086 1087 boolean following(int fromPos) { 1088 if (fromPos >= fLimit || fromPos < fStart) { 1089 fPositionInCache = -1; 1090 return false; 1091 } 1092 1093 // Sequential iteration, move from previous boundary to the following 1094 1095 int r = 0; 1096 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1097 ++fPositionInCache; 1098 if (fPositionInCache >= fBreaks.size()) { 1099 fPositionInCache = -1; 1100 return false; 1101 } 1102 r = fBreaks.elementAt(fPositionInCache); 1103 assert(r > fromPos); 1104 fBoundary = r; 1105 fStatusIndex = fOtherRuleStatusIndex; 1106 return true; 1107 } 1108 1109 // Random indexing. Linear search for the boundary following the given position. 1110 1111 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1112 r= fBreaks.elementAt(fPositionInCache); 1113 if (r > fromPos) { 1114 fBoundary = r; 1115 fStatusIndex = fOtherRuleStatusIndex; 1116 return true; 1117 } 1118 } 1119 1120 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1121 assert(false); 1122 fPositionInCache = -1; 1123 return false; 1124 }; 1125 preceding(int fromPos)1126 boolean preceding(int fromPos) { 1127 if (fromPos <= fStart || fromPos > fLimit) { 1128 fPositionInCache = -1; 1129 return false; 1130 } 1131 1132 if (fromPos == fLimit) { 1133 fPositionInCache = fBreaks.size() - 1; 1134 if (fPositionInCache >= 0) { 1135 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1136 } 1137 } 1138 1139 int r; 1140 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1141 --fPositionInCache; 1142 r = fBreaks.elementAt(fPositionInCache); 1143 assert(r < fromPos); 1144 fBoundary = r; 1145 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1146 return true; 1147 } 1148 1149 if (fPositionInCache == 0) { 1150 fPositionInCache = -1; 1151 return false; 1152 } 1153 1154 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1155 r = fBreaks.elementAt(fPositionInCache); 1156 if (r < fromPos) { 1157 fBoundary = r; 1158 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1159 return true; 1160 } 1161 } 1162 assert(false); 1163 fPositionInCache = -1; 1164 return false; 1165 }; 1166 1167 /** 1168 * Populate the cache with the dictionary based boundaries within a region of text. 1169 * @param startPos The start position of a range of text 1170 * @param endPos The end position of a range of text 1171 * @param firstRuleStatus The rule status index that applies to the break at startPos 1172 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1173 * @hide draft / provisional / internal are hidden on OHOS 1174 */ populateDictionary(int startPos, int endPos, int firstRuleStatus, int otherRuleStatus)1175 void populateDictionary(int startPos, int endPos, 1176 int firstRuleStatus, int otherRuleStatus) { 1177 if ((endPos - startPos) <= 1) { 1178 return; 1179 } 1180 1181 reset(); 1182 fFirstRuleStatusIndex = firstRuleStatus; 1183 fOtherRuleStatusIndex = otherRuleStatus; 1184 1185 int rangeStart = startPos; 1186 int rangeEnd = endPos; 1187 1188 int category; 1189 int current; 1190 int foundBreakCount = 0; 1191 1192 // Loop through the text, looking for ranges of dictionary characters. 1193 // For each span, find the appropriate break engine, and ask it to find 1194 // any breaks within the span. 1195 1196 fText.setIndex(rangeStart); 1197 int c = CharacterIteration.current32(fText); 1198 category = (short)fRData.fTrie.get(c); 1199 1200 while(true) { 1201 while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { 1202 c = CharacterIteration.next32(fText); // pre-increment 1203 category = (short)fRData.fTrie.get(c); 1204 } 1205 if (current >= rangeEnd) { 1206 break; 1207 } 1208 1209 // We now have a dictionary character. Get the appropriate language object 1210 // to deal with it. 1211 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1212 1213 // Ask the language object if there are any breaks. It will add them to the cache and 1214 // leave the text pointer on the other side of its range, ready to search for the next one. 1215 if (lbe != null) { 1216 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks); 1217 } 1218 1219 // Reload the loop variables for the next go-round 1220 c = CharacterIteration.current32(fText); 1221 category = (short)fRData.fTrie.get(c); 1222 } 1223 1224 // If we found breaks, ensure that the first and last entries are 1225 // the original starting and ending position. And initialize the 1226 // cache iteration position to the first entry. 1227 1228 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1229 if (foundBreakCount > 0) { 1230 assert(foundBreakCount == fBreaks.size()); 1231 if (startPos < fBreaks.elementAt(0)) { 1232 // The dictionary did not place a boundary at the start of the segment of text. 1233 // Add one now. This should not commonly happen, but it would be easy for interactions 1234 // of the rules for dictionary segments and the break engine implementations to 1235 // inadvertently cause it. Cover it here, just in case. 1236 fBreaks.offer(startPos); 1237 } 1238 if (endPos > fBreaks.peek()) { 1239 fBreaks.push(endPos); 1240 } 1241 fPositionInCache = 0; 1242 // Note: Dictionary matching may extend beyond the original limit. 1243 fStart = fBreaks.elementAt(0); 1244 fLimit = fBreaks.peek(); 1245 } else { 1246 // there were no language-based breaks, even though the segment contained 1247 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1248 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1249 } 1250 1251 }; 1252 1253 DictionaryCache()1254 DictionaryCache() { 1255 fPositionInCache = -1; 1256 fBreaks = new DictionaryBreakEngine.DequeI(); 1257 } 1258 1259 /** 1260 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1261 * 1262 * @param src the source object to be copied. 1263 */ DictionaryCache(DictionaryCache src)1264 DictionaryCache(DictionaryCache src) { 1265 try { 1266 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1267 } 1268 catch (CloneNotSupportedException e) { 1269 throw new RuntimeException(e); 1270 } 1271 fPositionInCache = src.fPositionInCache; 1272 fStart = src.fStart; 1273 fLimit = src.fLimit; 1274 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1275 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1276 fBoundary = src.fBoundary; 1277 fStatusIndex = src.fStatusIndex; 1278 } 1279 1280 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1281 DictionaryBreakEngine.DequeI fBreaks; 1282 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1283 // // or preceding(). Optimizes sequential access. 1284 int fStart; // Text position of first boundary in cache. 1285 int fLimit; // Last boundary in cache. Which is the limit of the 1286 // // text segment being handled by the dictionary. 1287 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1288 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1289 int fBoundary; // Current boundary. Set by preceding(), following(). 1290 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1291 }; 1292 1293 1294 1295 1296 /* 1297 * class BreakCache 1298 * 1299 * Cache of break boundary positions and rule status values. 1300 * Break iterator API functions, next(), previous(), etc., will use cached results 1301 * when possible, and otherwise cache new results as they are obtained. 1302 * 1303 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1304 * 1305 * The cache is implemented as a single circular buffer. 1306 */ 1307 1308 /* 1309 * size of the circular cache buffer. 1310 */ 1311 1312 class BreakCache { 1313 BreakCache()1314 BreakCache() { 1315 reset(); 1316 }; 1317 reset(int pos, int ruleStatus)1318 void reset(int pos, int ruleStatus) { 1319 fStartBufIdx = 0; 1320 fEndBufIdx = 0; 1321 fTextIdx = pos; 1322 fBufIdx = 0; 1323 fBoundaries[0] = pos; 1324 fStatuses[0] = (short)ruleStatus; 1325 } 1326 reset()1327 void reset() {reset(0, 0); }; 1328 next()1329 void next() { 1330 if (fBufIdx == fEndBufIdx) { 1331 fDone = !populateFollowing(); 1332 fPosition = fTextIdx; 1333 fRuleStatusIndex = fStatuses[fBufIdx]; 1334 } else { 1335 fBufIdx = modChunkSize(fBufIdx + 1); 1336 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1337 fRuleStatusIndex = fStatuses[fBufIdx]; 1338 } 1339 }; 1340 previous()1341 void previous() { 1342 int initialBufIdx = fBufIdx; 1343 if (fBufIdx == fStartBufIdx) { 1344 // At start of cache. Prepend to it. 1345 populatePreceding(); 1346 } else { 1347 // Cache already holds the next boundary 1348 fBufIdx = modChunkSize(fBufIdx - 1); 1349 fTextIdx = fBoundaries[fBufIdx]; 1350 } 1351 fDone = (fBufIdx == initialBufIdx); 1352 fPosition = fTextIdx; 1353 fRuleStatusIndex = fStatuses[fBufIdx]; 1354 return; 1355 }; 1356 1357 // Move the iteration state to the position following the startPosition. 1358 // Input position must be pinned to the input length. following(int startPos)1359 void following(int startPos) { 1360 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1361 // startPos is in the cache. Do a next() from that position. 1362 // TODO: an awkward set of interactions with bi->fDone 1363 // seek() does not clear it; it can't because of interactions with populateNear(). 1364 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1365 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1366 fDone = false; 1367 next(); 1368 } 1369 1370 }; 1371 preceding(int startPos)1372 void preceding(int startPos) { 1373 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1374 if (startPos == fTextIdx) { 1375 previous(); 1376 } else { 1377 // seek() leaves the BreakCache positioned at the preceding boundary 1378 // if the requested position is between two bounaries. 1379 // current() pushes the BreakCache position out to the BreakIterator itself. 1380 assert(startPos > fTextIdx); 1381 current(); 1382 } 1383 } 1384 return; 1385 }; 1386 1387 /** 1388 * Update the state of the public BreakIterator (fBI) to reflect the 1389 * current state of the break iterator cache (this). 1390 */ current()1391 int current() { 1392 fPosition = fTextIdx; 1393 fRuleStatusIndex = fStatuses[fBufIdx]; 1394 fDone = false; 1395 return fTextIdx; 1396 }; 1397 1398 /** 1399 * Add boundaries to the cache near the specified position. 1400 * The given position need not be a boundary itself. 1401 * The input position must be within the range of the text, and 1402 * on a code point boundary. 1403 * If the requested position is a break boundary, leave the iteration 1404 * position on it. 1405 * If the requested position is not a boundary, leave the iteration 1406 * position on the preceding boundary and include both the the 1407 * preceding and following boundaries in the cache. 1408 * Additional boundaries, either preceding or following, may be added 1409 * to the cache as a side effect. 1410 * 1411 * Return false if the operation failed. 1412 */ populateNear(int position)1413 boolean populateNear(int position) { 1414 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1415 1416 // Find a boundary somewhere in the vicinity of the requested position. 1417 // Depending on the safe rules and the text data, it could be either before, at, or after 1418 // the requested position. 1419 1420 1421 // If the requested position is not near already cached positions, clear the existing cache, 1422 // find a near-by boundary and begin new cache contents there. 1423 1424 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1425 int aBoundary = fText.getBeginIndex(); 1426 int ruleStatusIndex = 0; 1427 if (position > aBoundary + 20) { 1428 int backupPos = handleSafePrevious(position); 1429 if (backupPos > aBoundary) { 1430 // Advance to the boundary following the backup position. 1431 // There is a complication: the safe reverse rules identify pairs of code points 1432 // that are safe. If advancing from the safe point moves forwards by less than 1433 // two code points, we need to advance one more time to ensure that the boundary 1434 // is good, including a correct rules status value. 1435 // 1436 fPosition = backupPos; 1437 aBoundary = handleNext(); 1438 if (aBoundary == backupPos + 1 || 1439 (aBoundary == backupPos + 2 && 1440 Character.isHighSurrogate(fText.setIndex(backupPos)) && 1441 Character.isLowSurrogate(fText.next()))) { 1442 // The initial handleNext() only advanced by a single code point. Go again. 1443 // Safe rules identify safe pairs. 1444 aBoundary = handleNext(); 1445 } 1446 } 1447 ruleStatusIndex = fRuleStatusIndex; 1448 } 1449 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1450 } 1451 1452 // Fill in boundaries between existing cache content and the new requested position. 1453 1454 if (fBoundaries[fEndBufIdx] < position) { 1455 // The last position in the cache precedes the requested position. 1456 // Add following position(s) to the cache. 1457 while (fBoundaries[fEndBufIdx] < position) { 1458 if (!populateFollowing()) { 1459 assert false; 1460 return false; 1461 } 1462 } 1463 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1464 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1465 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1466 previous(); 1467 } 1468 return true; 1469 } 1470 1471 if (fBoundaries[fStartBufIdx] > position) { 1472 // The first position in the cache is beyond the requested position. 1473 // back up more until we get a boundary <= the requested position. 1474 while (fBoundaries[fStartBufIdx] > position) { 1475 populatePreceding(); 1476 } 1477 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1478 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1479 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1480 next(); 1481 } 1482 if (fTextIdx > position) { 1483 // If position is not itself a boundary, the next() loop above will overshoot. 1484 // Back up one, leaving cache position at the boundary preceding the requested position. 1485 previous(); 1486 } 1487 return true; 1488 } 1489 1490 assert fTextIdx == position; 1491 return true; 1492 1493 }; 1494 1495 /** 1496 * Add boundary(s) to the cache following the current last boundary. 1497 * Return false if at the end of the text, and no more boundaries can be added. 1498 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1499 */ populateFollowing()1500 boolean populateFollowing() { 1501 int fromPosition = fBoundaries[fEndBufIdx]; 1502 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1503 int pos = 0; 1504 int ruleStatusIdx = 0; 1505 1506 if (fDictionaryCache.following(fromPosition)) { 1507 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1508 return true; 1509 } 1510 1511 fPosition = fromPosition; 1512 pos = handleNext(); 1513 if (pos == BreakIterator.DONE) { 1514 return false; 1515 } 1516 1517 ruleStatusIdx = fRuleStatusIndex; 1518 if (fDictionaryCharCount > 0) { 1519 // The text segment obtained from the rules includes dictionary characters. 1520 // Subdivide it, with subdivided results going into the dictionary cache. 1521 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1522 if (fDictionaryCache.following(fromPosition)) { 1523 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1524 return true; 1525 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1526 // But be careful with interactions with populateNear(). 1527 } 1528 } 1529 1530 // Rule based segment did not include dictionary characters. 1531 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1532 // meaning that we didn't take the return, above. 1533 // Add its end point to the cache. 1534 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1535 1536 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1537 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1538 // 1539 for (int count=0; count<6; ++count) { 1540 pos = handleNext(); 1541 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1542 break; 1543 } 1544 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1545 } 1546 return true; 1547 }; 1548 1549 /** 1550 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1551 * Leave the iteration position on the first added boundary. 1552 * Return false if no boundaries could be added (if at the start of the text.) 1553 */ populatePreceding()1554 boolean populatePreceding() { 1555 int textBegin = fText.getBeginIndex(); 1556 int fromPosition = fBoundaries[fStartBufIdx]; 1557 if (fromPosition == textBegin) { 1558 return false; 1559 } 1560 1561 int position = textBegin; 1562 int positionStatusIdx = 0; 1563 1564 if (fDictionaryCache.preceding(fromPosition)) { 1565 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1566 return true; 1567 } 1568 1569 int backupPosition = fromPosition; 1570 1571 // Find a boundary somewhere preceding the first already-cached boundary 1572 do { 1573 backupPosition = backupPosition - 30; 1574 if (backupPosition <= textBegin) { 1575 backupPosition = textBegin; 1576 } else { 1577 backupPosition = handleSafePrevious(backupPosition); 1578 } 1579 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1580 position = textBegin; 1581 positionStatusIdx = 0; 1582 } else { 1583 // Advance to the boundary following the backup position. 1584 // There is a complication: the safe reverse rules identify pairs of code points 1585 // that are safe. If advancing from the safe point moves forwards by less than 1586 // two code points, we need to advance one more time to ensure that the boundary 1587 // is good, including a correct rules status value. 1588 // 1589 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1590 position = handleNext(); 1591 if (position == backupPosition + 1 || 1592 (position == backupPosition + 2 && 1593 Character.isHighSurrogate(fText.setIndex(backupPosition)) && 1594 Character.isLowSurrogate(fText.next()))) { 1595 // The initial handleNext() only advanced by a single code point. Go again. 1596 // Safe rules identify safe pairs. 1597 position = handleNext(); 1598 } 1599 positionStatusIdx = fRuleStatusIndex; 1600 } 1601 } while (position >= fromPosition); 1602 1603 // Find boundaries between the one we just located and the first already-cached boundary 1604 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer. 1605 1606 fSideBuffer.removeAllElements(); 1607 fSideBuffer.push(position); 1608 fSideBuffer.push(positionStatusIdx); 1609 1610 do { 1611 int prevPosition = fPosition = position; 1612 int prevStatusIdx = positionStatusIdx; 1613 position = handleNext(); 1614 positionStatusIdx = fRuleStatusIndex; 1615 if (position == BreakIterator.DONE) { 1616 break; 1617 } 1618 1619 boolean segmentHandledByDictionary = false; 1620 if (fDictionaryCharCount != 0) { 1621 // Segment from the rules includes dictionary characters. 1622 // Subdivide it, with subdivided results going into the dictionary cache. 1623 int dictSegEndPosition = position; 1624 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1625 while (fDictionaryCache.following(prevPosition)) { 1626 position = fDictionaryCache.fBoundary; 1627 positionStatusIdx = fDictionaryCache.fStatusIndex; 1628 segmentHandledByDictionary = true; 1629 assert(position > prevPosition); 1630 if (position >= fromPosition) { 1631 break; 1632 } 1633 assert(position <= dictSegEndPosition); 1634 fSideBuffer.push(position); 1635 fSideBuffer.push(positionStatusIdx); 1636 prevPosition = position; 1637 } 1638 assert(position==dictSegEndPosition || position>=fromPosition); 1639 } 1640 1641 if (!segmentHandledByDictionary && position < fromPosition) { 1642 fSideBuffer.push(position); 1643 fSideBuffer.push(positionStatusIdx); 1644 } 1645 } while (position < fromPosition); 1646 1647 // Move boundaries from the side buffer to the main circular buffer. 1648 boolean success = false; 1649 if (!fSideBuffer.isEmpty()) { 1650 positionStatusIdx = fSideBuffer.pop(); 1651 position = fSideBuffer.pop(); 1652 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1653 success = true; 1654 } 1655 1656 while (!fSideBuffer.isEmpty()) { 1657 positionStatusIdx = fSideBuffer.pop(); 1658 position = fSideBuffer.pop(); 1659 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1660 // No space in circular buffer to hold a new preceding result while 1661 // also retaining the current cache (iteration) position. 1662 // Bailing out is safe; the cache will refill again if needed. 1663 break; 1664 } 1665 } 1666 return success; 1667 }; 1668 1669 1670 static final boolean RetainCachePosition = false; 1671 static final boolean UpdateCachePosition = true; 1672 1673 /** 1674 * Add the boundary following the current position. 1675 * The current position can be left as it was, or changed to the newly added boundary, 1676 * as specified by the update parameter. 1677 */ addFollowing(int position, int ruleStatusIdx, boolean update)1678 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1679 assert(position > fBoundaries[fEndBufIdx]); 1680 assert(ruleStatusIdx <= Short.MAX_VALUE); 1681 int nextIdx = modChunkSize(fEndBufIdx + 1); 1682 if (nextIdx == fStartBufIdx) { 1683 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1684 } 1685 fBoundaries[nextIdx] = position; 1686 fStatuses[nextIdx] = (short)ruleStatusIdx; 1687 fEndBufIdx = nextIdx; 1688 if (update == UpdateCachePosition) { 1689 // Set current position to the newly added boundary. 1690 fBufIdx = nextIdx; 1691 fTextIdx = position; 1692 } else { 1693 // Retaining the original cache position. 1694 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1695 // It's the responsibility of callers of this function to not add too many. 1696 assert(nextIdx != fBufIdx); 1697 } 1698 1699 }; 1700 1701 1702 /** 1703 * Add the boundary preceding the current position. 1704 * The current position can be left as it was, or changed to the newly added boundary, 1705 * as specified by the update parameter. 1706 */ addPreceding(int position, int ruleStatusIdx, boolean update)1707 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1708 assert(position < fBoundaries[fStartBufIdx]); 1709 assert(ruleStatusIdx <= Short.MAX_VALUE); 1710 int nextIdx = modChunkSize(fStartBufIdx - 1); 1711 if (nextIdx == fEndBufIdx) { 1712 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1713 // Failure. The insertion of the new boundary would claim the buffer position that is the 1714 // current iteration position. And we also want to retain the current iteration position. 1715 // (The buffer is already completely full of entries that precede the iteration position.) 1716 return false; 1717 } 1718 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1719 } 1720 fBoundaries[nextIdx] = position; 1721 fStatuses[nextIdx] = (short)ruleStatusIdx; 1722 fStartBufIdx = nextIdx; 1723 if (update == UpdateCachePosition) { 1724 fBufIdx = nextIdx; 1725 fTextIdx = position; 1726 } 1727 return true; 1728 }; 1729 1730 /** 1731 * Set the cache position to the specified position, or, if the position 1732 * falls between to cached boundaries, to the preceding boundary. 1733 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1734 * The startPosition must be on a code point boundary. 1735 * 1736 * Return true if successful, false if the specified position is after 1737 * the last cached boundary or before the first. 1738 */ 1739 boolean seek(int pos) { 1740 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1741 return false; 1742 } 1743 if (pos == fBoundaries[fStartBufIdx]) { 1744 // Common case: seek(0), from BreakIterator::first() 1745 fBufIdx = fStartBufIdx; 1746 fTextIdx = fBoundaries[fBufIdx]; 1747 return true; 1748 } 1749 if (pos == fBoundaries[fEndBufIdx]) { 1750 fBufIdx = fEndBufIdx; 1751 fTextIdx = fBoundaries[fBufIdx]; 1752 return true; 1753 } 1754 1755 int min = fStartBufIdx; 1756 int max = fEndBufIdx; 1757 while (min != max) { 1758 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1759 probe = modChunkSize(probe); 1760 if (fBoundaries[probe] > pos) { 1761 max = probe; 1762 } else { 1763 min = modChunkSize(probe + 1); 1764 } 1765 } 1766 assert(fBoundaries[max] > pos); 1767 fBufIdx = modChunkSize(max - 1); 1768 fTextIdx = fBoundaries[fBufIdx]; 1769 assert(fTextIdx <= pos); 1770 return true; 1771 1772 }; 1773 1774 1775 /** 1776 * copy constructor, used from RuleBasedBreakIterator.clone(). 1777 * 1778 * @param src 1779 */ BreakCache(BreakCache src)1780 BreakCache(BreakCache src) { 1781 fStartBufIdx = src.fStartBufIdx; 1782 fEndBufIdx = src.fEndBufIdx; 1783 fTextIdx = src.fTextIdx; 1784 fBufIdx = src.fBufIdx; 1785 fBoundaries = src.fBoundaries.clone(); 1786 fStatuses = src.fStatuses.clone(); 1787 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1788 } 1789 dumpCache()1790 void dumpCache() { 1791 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1792 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1793 System.out.printf("%d %d%n", i, fBoundaries[i]); 1794 if (i == fEndBufIdx) { 1795 break; 1796 } 1797 } 1798 }; 1799 modChunkSize(int index)1800 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1801 1802 static final int CACHE_SIZE = 128; 1803 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1804 1805 int fStartBufIdx; 1806 int fEndBufIdx; // inclusive 1807 1808 int fTextIdx; 1809 int fBufIdx; 1810 1811 int[] fBoundaries = new int[CACHE_SIZE]; 1812 short[] fStatuses = new short[CACHE_SIZE]; 1813 1814 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1815 }; 1816 1817 1818 1819 1820 } 1821 1822