1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 1999-2016 International Business Machines Corporation * 6 * and others. All rights reserved. * 7 *************************************************************************** 8 9 ********************************************************************** 10 * Date Name Description 11 * 10/22/99 alan Creation. 12 * 11/11/99 rgillam Complete port from Java. 13 ********************************************************************** 14 */ 15 16 #ifndef RBBI_H 17 #define RBBI_H 18 19 #include "unicode/utypes.h" 20 21 /** 22 * \file 23 * \brief C++ API: Rule Based Break Iterator 24 */ 25 26 #if !UCONFIG_NO_BREAK_ITERATION 27 28 #include "unicode/brkiter.h" 29 #include "unicode/udata.h" 30 #include "unicode/parseerr.h" 31 #include "unicode/schriter.h" 32 33 U_NAMESPACE_BEGIN 34 35 /** @internal */ 36 class LanguageBreakEngine; 37 struct RBBIDataHeader; 38 class RBBIDataWrapper; 39 class UnhandledEngine; 40 class UStack; 41 42 /** 43 * 44 * A subclass of BreakIterator whose behavior is specified using a list of rules. 45 * <p>Instances of this class are most commonly created by the factory methods of 46 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., 47 * and then used via the abstract API in class BreakIterator</p> 48 * 49 * <p>See the ICU User Guide for information on Break Iterator Rules.</p> 50 * 51 * <p>This class is not intended to be subclassed.</p> 52 */ 53 class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator { 54 55 private: 56 /** 57 * The UText through which this BreakIterator accesses the text 58 * @internal (private) 59 */ 60 UText fText; 61 62 #ifndef U_HIDE_INTERNAL_API 63 public: 64 #endif /* U_HIDE_INTERNAL_API */ 65 /** 66 * The rule data for this BreakIterator instance. 67 * Not for general use; Public only for testing purposes. 68 * @internal 69 */ 70 RBBIDataWrapper *fData; 71 private: 72 73 /** 74 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 75 * Never has the value UBRK_DONE (-1). 76 */ 77 int32_t fPosition; 78 79 /** 80 * TODO: 81 */ 82 int32_t fRuleStatusIndex; 83 84 /** 85 * Cache of previously determined boundary positions. 86 */ 87 class BreakCache; 88 BreakCache *fBreakCache; 89 90 /** 91 * Cache of boundary positions within a region of text that has been 92 * sub-divided by dictionary based breaking. 93 */ 94 class DictionaryCache; 95 DictionaryCache *fDictionaryCache; 96 97 /** 98 * 99 * If present, UStack of LanguageBreakEngine objects that might handle 100 * dictionary characters. Searched from top to bottom to find an object to 101 * handle a given character. 102 * @internal (private) 103 */ 104 UStack *fLanguageBreakEngines; 105 106 /** 107 * 108 * If present, the special LanguageBreakEngine used for handling 109 * characters that are in the dictionary set, but not handled by any 110 * LangugageBreakEngine. 111 * @internal (private) 112 */ 113 UnhandledEngine *fUnhandledBreakEngine; 114 115 /** 116 * Counter for the number of characters encountered with the "dictionary" 117 * flag set. 118 * @internal (private) 119 */ 120 uint32_t fDictionaryCharCount; 121 122 /** 123 * A character iterator that refers to the same text as the UText, above. 124 * Only included for compatibility with old API, which was based on CharacterIterators. 125 * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. 126 */ 127 CharacterIterator *fCharIter; 128 129 /** 130 * When the input text is provided by a UnicodeString, this will point to 131 * a characterIterator that wraps that data. Needed only for the 132 * implementation of getText(), a backwards compatibility issue. 133 */ 134 StringCharacterIterator fSCharIter; 135 136 /** 137 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 138 */ 139 UBool fDone; 140 141 //======================================================================= 142 // constructors 143 //======================================================================= 144 145 /** 146 * Constructor from a flattened set of RBBI data in malloced memory. 147 * RulesBasedBreakIterators built from a custom set of rules 148 * are created via this constructor; the rules are compiled 149 * into memory, then the break iterator is constructed here. 150 * 151 * The break iterator adopts the memory, and will 152 * free it when done. 153 * @internal (private) 154 */ 155 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); 156 157 /** @internal */ 158 friend class RBBIRuleBuilder; 159 /** @internal */ 160 friend class BreakIterator; 161 162 public: 163 164 /** Default constructor. Creates an empty shell of an iterator, with no 165 * rules or text to iterate over. Object can subsequently be assigned to. 166 * @stable ICU 2.2 167 */ 168 RuleBasedBreakIterator(); 169 170 /** 171 * Copy constructor. Will produce a break iterator with the same behavior, 172 * and which iterates over the same text, as the one passed in. 173 * @param that The RuleBasedBreakIterator passed to be copied 174 * @stable ICU 2.0 175 */ 176 RuleBasedBreakIterator(const RuleBasedBreakIterator& that); 177 178 /** 179 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 180 * @param rules The break rules to be used. 181 * @param parseError In the event of a syntax error in the rules, provides the location 182 * within the rules of the problem. 183 * @param status Information on any errors encountered. 184 * @stable ICU 2.2 185 */ 186 RuleBasedBreakIterator( const UnicodeString &rules, 187 UParseError &parseError, 188 UErrorCode &status); 189 190 /** 191 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. 192 * Binary rules are obtained from RulesBasedBreakIterator::getBinaryRules(). 193 * Construction of a break iterator in this way is substantially faster than 194 * construction from source rules. 195 * 196 * Ownership of the storage containing the compiled rules remains with the 197 * caller of this function. The compiled rules must not be modified or 198 * deleted during the life of the break iterator. 199 * 200 * The compiled rules are not compatible across different major versions of ICU. 201 * The compiled rules are compatible only between machines with the same 202 * byte ordering (little or big endian) and the same base character set family 203 * (ASCII or EBCDIC). 204 * 205 * @see #getBinaryRules 206 * @param compiledRules A pointer to the compiled break rules to be used. 207 * @param ruleLength The length of the compiled break rules, in bytes. This 208 * corresponds to the length value produced by getBinaryRules(). 209 * @param status Information on any errors encountered, including invalid 210 * binary rules. 211 * @stable ICU 4.8 212 */ 213 RuleBasedBreakIterator(const uint8_t *compiledRules, 214 uint32_t ruleLength, 215 UErrorCode &status); 216 217 /** 218 * This constructor uses the udata interface to create a BreakIterator 219 * whose internal tables live in a memory-mapped file. "image" is an 220 * ICU UDataMemory handle for the pre-compiled break iterator tables. 221 * @param image handle to the memory image for the break iterator data. 222 * Ownership of the UDataMemory handle passes to the Break Iterator, 223 * which will be responsible for closing it when it is no longer needed. 224 * @param status Information on any errors encountered. 225 * @see udata_open 226 * @see #getBinaryRules 227 * @stable ICU 2.8 228 */ 229 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); 230 231 /** 232 * Destructor 233 * @stable ICU 2.0 234 */ 235 virtual ~RuleBasedBreakIterator(); 236 237 /** 238 * Assignment operator. Sets this iterator to have the same behavior, 239 * and iterate over the same text, as the one passed in. 240 * @param that The RuleBasedBreakItertor passed in 241 * @return the newly created RuleBasedBreakIterator 242 * @stable ICU 2.0 243 */ 244 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); 245 246 /** 247 * Equality operator. Returns TRUE if both BreakIterators are of the 248 * same class, have the same behavior, and iterate over the same text. 249 * @param that The BreakIterator to be compared for equality 250 * @return TRUE if both BreakIterators are of the 251 * same class, have the same behavior, and iterate over the same text. 252 * @stable ICU 2.0 253 */ 254 virtual UBool operator==(const BreakIterator& that) const; 255 256 /** 257 * Not-equal operator. If operator== returns TRUE, this returns FALSE, 258 * and vice versa. 259 * @param that The BreakIterator to be compared for inequality 260 * @return TRUE if both BreakIterators are not same. 261 * @stable ICU 2.0 262 */ 263 UBool operator!=(const BreakIterator& that) const; 264 265 /** 266 * Returns a newly-constructed RuleBasedBreakIterator with the same 267 * behavior, and iterating over the same text, as this one. 268 * Differs from the copy constructor in that it is polymorphic, and 269 * will correctly clone (copy) a derived class. 270 * clone() is thread safe. Multiple threads may simultaneously 271 * clone the same source break iterator. 272 * @return a newly-constructed RuleBasedBreakIterator 273 * @stable ICU 2.0 274 */ 275 virtual BreakIterator* clone() const; 276 277 /** 278 * Compute a hash code for this BreakIterator 279 * @return A hash code 280 * @stable ICU 2.0 281 */ 282 virtual int32_t hashCode(void) const; 283 284 /** 285 * Returns the description used to create this iterator 286 * @return the description used to create this iterator 287 * @stable ICU 2.0 288 */ 289 virtual const UnicodeString& getRules(void) const; 290 291 //======================================================================= 292 // BreakIterator overrides 293 //======================================================================= 294 295 /** 296 * <p> 297 * Return a CharacterIterator over the text being analyzed. 298 * The returned character iterator is owned by the break iterator, and must 299 * not be deleted by the caller. Repeated calls to this function may 300 * return the same CharacterIterator. 301 * </p> 302 * <p> 303 * The returned character iterator must not be used concurrently with 304 * the break iterator. If concurrent operation is needed, clone the 305 * returned character iterator first and operate on the clone. 306 * </p> 307 * <p> 308 * When the break iterator is operating on text supplied via a UText, 309 * this function will fail. Lacking any way to signal failures, it 310 * returns an CharacterIterator containing no text. 311 * The function getUText() provides similar functionality, 312 * is reliable, and is more efficient. 313 * </p> 314 * 315 * TODO: deprecate this function? 316 * 317 * @return An iterator over the text being analyzed. 318 * @stable ICU 2.0 319 */ 320 virtual CharacterIterator& getText(void) const; 321 322 323 /** 324 * Get a UText for the text being analyzed. 325 * The returned UText is a shallow clone of the UText used internally 326 * by the break iterator implementation. It can safely be used to 327 * access the text without impacting any break iterator operations, 328 * but the underlying text itself must not be altered. 329 * 330 * @param fillIn A UText to be filled in. If NULL, a new UText will be 331 * allocated to hold the result. 332 * @param status receives any error codes. 333 * @return The current UText for this break iterator. If an input 334 * UText was provided, it will always be returned. 335 * @stable ICU 3.4 336 */ 337 virtual UText *getUText(UText *fillIn, UErrorCode &status) const; 338 339 /** 340 * Set the iterator to analyze a new piece of text. This function resets 341 * the current iteration position to the beginning of the text. 342 * @param newText An iterator over the text to analyze. The BreakIterator 343 * takes ownership of the character iterator. The caller MUST NOT delete it! 344 * @stable ICU 2.0 345 */ 346 virtual void adoptText(CharacterIterator* newText); 347 348 /** 349 * Set the iterator to analyze a new piece of text. This function resets 350 * the current iteration position to the beginning of the text. 351 * 352 * The BreakIterator will retain a reference to the supplied string. 353 * The caller must not modify or delete the text while the BreakIterator 354 * retains the reference. 355 * 356 * @param newText The text to analyze. 357 * @stable ICU 2.0 358 */ 359 virtual void setText(const UnicodeString& newText); 360 361 /** 362 * Reset the break iterator to operate over the text represented by 363 * the UText. The iterator position is reset to the start. 364 * 365 * This function makes a shallow clone of the supplied UText. This means 366 * that the caller is free to immediately close or otherwise reuse the 367 * Utext that was passed as a parameter, but that the underlying text itself 368 * must not be altered while being referenced by the break iterator. 369 * 370 * @param text The UText used to change the text. 371 * @param status Receives any error codes. 372 * @stable ICU 3.4 373 */ 374 virtual void setText(UText *text, UErrorCode &status); 375 376 /** 377 * Sets the current iteration position to the beginning of the text, position zero. 378 * @return The offset of the beginning of the text, zero. 379 * @stable ICU 2.0 380 */ 381 virtual int32_t first(void); 382 383 /** 384 * Sets the current iteration position to the end of the text. 385 * @return The text's past-the-end offset. 386 * @stable ICU 2.0 387 */ 388 virtual int32_t last(void); 389 390 /** 391 * Advances the iterator either forward or backward the specified number of steps. 392 * Negative values move backward, and positive values move forward. This is 393 * equivalent to repeatedly calling next() or previous(). 394 * @param n The number of steps to move. The sign indicates the direction 395 * (negative is backwards, and positive is forwards). 396 * @return The character offset of the boundary position n boundaries away from 397 * the current one. 398 * @stable ICU 2.0 399 */ 400 virtual int32_t next(int32_t n); 401 402 /** 403 * Advances the iterator to the next boundary position. 404 * @return The position of the first boundary after this one. 405 * @stable ICU 2.0 406 */ 407 virtual int32_t next(void); 408 409 /** 410 * Moves the iterator backwards, to the last boundary preceding this one. 411 * @return The position of the last boundary position preceding this one. 412 * @stable ICU 2.0 413 */ 414 virtual int32_t previous(void); 415 416 /** 417 * Sets the iterator to refer to the first boundary position following 418 * the specified position. 419 * @param offset The position from which to begin searching for a break position. 420 * @return The position of the first break after the current position. 421 * @stable ICU 2.0 422 */ 423 virtual int32_t following(int32_t offset); 424 425 /** 426 * Sets the iterator to refer to the last boundary position before the 427 * specified position. 428 * @param offset The position to begin searching for a break from. 429 * @return The position of the last boundary before the starting position. 430 * @stable ICU 2.0 431 */ 432 virtual int32_t preceding(int32_t offset); 433 434 /** 435 * Returns true if the specified position is a boundary position. As a side 436 * effect, leaves the iterator pointing to the first boundary position at 437 * or after "offset". 438 * @param offset the offset to check. 439 * @return True if "offset" is a boundary position. 440 * @stable ICU 2.0 441 */ 442 virtual UBool isBoundary(int32_t offset); 443 444 /** 445 * Returns the current iteration position. Note that UBRK_DONE is never 446 * returned from this function; if iteration has run to the end of a 447 * string, current() will return the length of the string while 448 * next() will return UBRK_DONE). 449 * @return The current iteration position. 450 * @stable ICU 2.0 451 */ 452 virtual int32_t current(void) const; 453 454 455 /** 456 * Return the status tag from the break rule that determined the boundary at 457 * the current iteration position. For break rules that do not specify a 458 * status, a default value of 0 is returned. If more than one break rule 459 * would cause a boundary to be located at some position in the text, 460 * the numerically largest of the applicable status values is returned. 461 * <p> 462 * Of the standard types of ICU break iterators, only word break and 463 * line break provide status values. The values are defined in 464 * the header file ubrk.h. For Word breaks, the status allows distinguishing between words 465 * that contain alphabetic letters, "words" that appear to be numbers, 466 * punctuation and spaces, words containing ideographic characters, and 467 * more. For Line Break, the status distinguishes between hard (mandatory) breaks 468 * and soft (potential) break positions. 469 * <p> 470 * <code>getRuleStatus()</code> can be called after obtaining a boundary 471 * position from <code>next()</code>, <code>previous()</code>, or 472 * any other break iterator functions that returns a boundary position. 473 * <p> 474 * Note that <code>getRuleStatus()</code> returns the value corresponding to 475 * <code>current()</code> index even after <code>next()</code> has returned DONE. 476 * <p> 477 * When creating custom break rules, one is free to define whatever 478 * status values may be convenient for the application. 479 * <p> 480 * @return the status from the break rule that determined the boundary 481 * at the current iteration position. 482 * 483 * @see UWordBreak 484 * @stable ICU 2.2 485 */ 486 virtual int32_t getRuleStatus() const; 487 488 /** 489 * Get the status (tag) values from the break rule(s) that determined the boundary 490 * at the current iteration position. 491 * <p> 492 * The returned status value(s) are stored into an array provided by the caller. 493 * The values are stored in sorted (ascending) order. 494 * If the capacity of the output array is insufficient to hold the data, 495 * the output will be truncated to the available length, and a 496 * U_BUFFER_OVERFLOW_ERROR will be signaled. 497 * 498 * @param fillInVec an array to be filled in with the status values. 499 * @param capacity the length of the supplied vector. A length of zero causes 500 * the function to return the number of status values, in the 501 * normal way, without attempting to store any values. 502 * @param status receives error codes. 503 * @return The number of rule status values from the rules that determined 504 * the boundary at the current iteration position. 505 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value 506 * is the total number of status values that were available, 507 * not the reduced number that were actually returned. 508 * @see getRuleStatus 509 * @stable ICU 3.0 510 */ 511 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); 512 513 /** 514 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. 515 * This method is to implement a simple version of RTTI, since not all 516 * C++ compilers support genuine RTTI. Polymorphic operator==() and 517 * clone() methods call this method. 518 * 519 * @return The class ID for this object. All objects of a 520 * given class have the same class ID. Objects of 521 * other classes have different class IDs. 522 * @stable ICU 2.0 523 */ 524 virtual UClassID getDynamicClassID(void) const; 525 526 /** 527 * Returns the class ID for this class. This is useful only for 528 * comparing to a return value from getDynamicClassID(). For example: 529 * 530 * Base* polymorphic_pointer = createPolymorphicObject(); 531 * if (polymorphic_pointer->getDynamicClassID() == 532 * Derived::getStaticClassID()) ... 533 * 534 * @return The class ID for all objects of this class. 535 * @stable ICU 2.0 536 */ 537 static UClassID U_EXPORT2 getStaticClassID(void); 538 539 /** 540 * Deprecated functionality. Use clone() instead. 541 * 542 * Create a clone (copy) of this break iterator in memory provided 543 * by the caller. The idea is to increase performance by avoiding 544 * a storage allocation. Use of this function is NOT RECOMMENDED. 545 * Performance gains are minimal, and correct buffer management is 546 * tricky. Use clone() instead. 547 * 548 * @param stackBuffer The pointer to the memory into which the cloned object 549 * should be placed. If NULL, allocate heap memory 550 * for the cloned object. 551 * @param BufferSize The size of the buffer. If zero, return the required 552 * buffer size, but do not clone the object. If the 553 * size was too small (but not zero), allocate heap 554 * storage for the cloned object. 555 * 556 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be 557 * returned if the provided buffer was too small, and 558 * the clone was therefore put on the heap. 559 * 560 * @return Pointer to the clone object. This may differ from the stackBuffer 561 * address if the byte alignment of the stack buffer was not suitable 562 * or if the stackBuffer was too small to hold the clone. 563 * @deprecated ICU 52. Use clone() instead. 564 */ 565 virtual BreakIterator * createBufferClone(void *stackBuffer, 566 int32_t &BufferSize, 567 UErrorCode &status); 568 569 570 /** 571 * Return the binary form of compiled break rules, 572 * which can then be used to create a new break iterator at some 573 * time in the future. Creating a break iterator from pre-compiled rules 574 * is much faster than building one from the source form of the 575 * break rules. 576 * 577 * The binary data can only be used with the same version of ICU 578 * and on the same platform type (processor endian-ness) 579 * 580 * @param length Returns the length of the binary data. (Out parameter.) 581 * 582 * @return A pointer to the binary (compiled) rule data. The storage 583 * belongs to the RulesBasedBreakIterator object, not the 584 * caller, and must not be modified or deleted. 585 * @stable ICU 4.8 586 */ 587 virtual const uint8_t *getBinaryRules(uint32_t &length); 588 589 /** 590 * Set the subject text string upon which the break iterator is operating 591 * without changing any other aspect of the matching state. 592 * The new and previous text strings must have the same content. 593 * 594 * This function is intended for use in environments where ICU is operating on 595 * strings that may move around in memory. It provides a mechanism for notifying 596 * ICU that the string has been relocated, and providing a new UText to access the 597 * string in its new position. 598 * 599 * Note that the break iterator implementation never copies the underlying text 600 * of a string being processed, but always operates directly on the original text 601 * provided by the user. Refreshing simply drops the references to the old text 602 * and replaces them with references to the new. 603 * 604 * Caution: this function is normally used only by very specialized, 605 * system-level code. One example use case is with garbage collection that moves 606 * the text in memory. 607 * 608 * @param input The new (moved) text string. 609 * @param status Receives errors detected by this function. 610 * @return *this 611 * 612 * @stable ICU 49 613 */ 614 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status); 615 616 617 private: 618 //======================================================================= 619 // implementation 620 //======================================================================= 621 /** 622 * Dumps caches and performs other actions associated with a complete change 623 * in text or iteration position. 624 * @internal (private) 625 */ 626 void reset(void); 627 628 /** 629 * Common initialization function, used by constructors and bufferClone. 630 * @internal (private) 631 */ 632 void init(UErrorCode &status); 633 634 /** 635 * Iterate backwards from an arbitrary position in the input text using the 636 * synthesized Safe Reverse rules. 637 * This locates a "Safe Position" from which the forward break rules 638 * will operate correctly. A Safe Position is not necessarily a boundary itself. 639 * 640 * @param fromPosition the position in the input text to begin the iteration. 641 * @internal (private) 642 */ 643 int32_t handleSafePrevious(int32_t fromPosition); 644 645 /** 646 * Find a rule-based boundary by running the state machine. 647 * Input 648 * fPosition, the position in the text to begin from. 649 * Output 650 * fPosition: the boundary following the starting position. 651 * fDictionaryCharCount the number of dictionary characters encountered. 652 * If > 0, the segment will be further subdivided 653 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 654 * 655 * @internal (private) 656 */ 657 int32_t handleNext(); 658 659 660 /** 661 * This function returns the appropriate LanguageBreakEngine for a 662 * given character c. 663 * @param c A character in the dictionary set 664 * @internal (private) 665 */ 666 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); 667 668 public: 669 #ifndef U_HIDE_INTERNAL_API 670 /** 671 * Debugging function only. 672 * @internal 673 */ 674 void dumpCache(); 675 676 /** 677 * Debugging function only. 678 * @internal 679 */ 680 void dumpTables(); 681 682 #endif /* U_HIDE_INTERNAL_API */ 683 }; 684 685 //------------------------------------------------------------------------------ 686 // 687 // Inline Functions Definitions ... 688 // 689 //------------------------------------------------------------------------------ 690 691 inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { 692 return !operator==(that); 693 } 694 695 U_NAMESPACE_END 696 697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 698 699 #endif 700