1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: regex.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2002oct22 11 * created by: Andy Heninger 12 * 13 * ICU Regular Expressions, API for C++ 14 */ 15 16 #ifndef REGEX_H 17 #define REGEX_H 18 19 #define REGEX_DEBUG 20 21 /** 22 * \file 23 * \brief C++ API: Regular Expressions 24 * 25 * <h2>Regular Expression API</h2> 26 * 27 * <p>The ICU API for processing regular expressions consists of two classes, 28 * <code>RegexPattern</code> and <code>RegexMatcher</code>. 29 * <code>RegexPattern</code> objects represent a pre-processed, or compiled 30 * regular expression. They are created from a regular expression pattern string, 31 * and can be used to create <code>RegexMatcher</code> objects for the pattern.</p> 32 * 33 * <p>Class <code>RegexMatcher</code> bundles together a regular expression 34 * pattern and a target string to which the search pattern will be applied. 35 * <code>RegexMatcher</code> includes API for doing plain find or search 36 * operations, for search and replace operations, and for obtaining detailed 37 * information about bounds of a match. </p> 38 * 39 * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular 40 * expression pattern strings application code can be simplified and the explicit 41 * need for <code>RegexPattern</code> objects can usually be eliminated. 42 * </p> 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 48 49 #include "unicode/uobject.h" 50 #include "unicode/unistr.h" 51 #include "unicode/parseerr.h" 52 53 #include "unicode/uregex.h" 54 55 U_NAMESPACE_BEGIN 56 57 58 // Forward Declarations... 
59 60 class RegexMatcher; 61 class RegexPattern; 62 class UVector; 63 class UVector32; 64 class UnicodeSet; 65 struct REStackFrame; 66 struct Regex8BitSet; 67 class RuleBasedBreakIterator; 68 class RegexCImpl; 69 70 71 72 73 /** 74 * RBBIPatternDump Debug function, displays the compiled form of a pattern. 75 * @internal 76 */ 77 #ifdef REGEX_DEBUG 78 U_INTERNAL void U_EXPORT2 79 RegexPatternDump(const RegexPattern *pat); 80 #else 81 #define RegexPatternDump(pat) 82 #endif 83 84 85 86 /** 87 * Class <code>RegexPattern</code> represents a compiled regular expression. It includes 88 * factory methods for creating a RegexPattern object from the source (string) form 89 * of a regular expression, methods for creating RegexMatchers that allow the pattern 90 * to be applied to input text, and a few convenience methods for simple common 91 * uses of regular expressions. 92 * 93 * <p>Class RegexPattern is not intended to be subclassed.</p> 94 * 95 * @stable ICU 2.4 96 */ 97 class U_I18N_API RegexPattern: public UObject { 98 public: 99 100 /** 101 * default constructor. Create a RegexPattern object that refers to no actual 102 * pattern. Not normally needed; RegexPattern objects are usually 103 * created using the factory method <code>compile()</code>. 104 * 105 * @stable ICU 2.4 106 */ 107 RegexPattern(); 108 109 /** 110 * Copy Constructor. Create a new RegexPattern object that is equivalent 111 * to the source object. 112 * @param source the pattern object to be copied. 113 * @stable ICU 2.4 114 */ 115 RegexPattern(const RegexPattern &source); 116 117 /** 118 * Destructor. Note that a RegexPattern object must persist so long as any 119 * RegexMatcher objects that were created from the RegexPattern are active. 120 * @stable ICU 2.4 121 */ 122 virtual ~RegexPattern(); 123 124 /** 125 * Comparison operator. Two RegexPattern objects are considered equal if they 126 * were constructed from identical source patterns using the same match flag 127 * settings. 
128 * @param that a RegexPattern object to compare with "this". 129 * @return TRUE if the objects are equivalent. 130 * @stable ICU 2.4 131 */ 132 UBool operator==(const RegexPattern& that) const; 133 134 /** 135 * Comparison operator. Two RegexPattern objects are considered equal if they 136 * were constructed from identical source patterns using the same match flag 137 * settings. 138 * @param that a RegexPattern object to compare with "this". 139 * @return TRUE if the objects are different. 140 * @stable ICU 2.4 141 */ 142 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}; 143 144 /** 145 * Assignment operator. After assignment, this RegexPattern will behave identically 146 * to the source object. 147 * @stable ICU 2.4 148 */ 149 RegexPattern &operator =(const RegexPattern &source); 150 151 /** 152 * Create an exact copy of this RegexPattern object. Since RegexPattern is not 153 * intended to be subclasses, <code>clone()</code> and the copy construction are 154 * equivalent operations. 155 * @return the copy of this RegexPattern 156 * @stable ICU 2.4 157 */ 158 virtual RegexPattern *clone() const; 159 160 161 /** 162 * Compiles the regular expression in string form into a RegexPattern 163 * object. These compile methods, rather than the constructors, are the usual 164 * way that RegexPattern objects are created. 165 * 166 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 167 * objects created from the pattern are active. 
RegexMatchers keep a pointer 168 * back to their pattern, so premature deletion of the pattern is a 169 * catastrophic error.</p> 170 * 171 * <p>All pattern match mode flags are set to their default values.</p> 172 * 173 * <p>Note that it is often more convenient to construct a RegexMatcher directly 174 * from a pattern string rather than separately compiling the pattern and 175 * then creating a RegexMatcher object from the pattern.</p> 176 * 177 * @param regex The regular expression to be compiled. 178 * @param pe Receives the position (line and column nubers) of any error 179 * within the regular expression.) 180 * @param status A reference to a UErrorCode to receive any errors. 181 * @return A regexPattern object for the compiled pattern. 182 * 183 * @stable ICU 2.4 184 */ 185 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 186 UParseError &pe, 187 UErrorCode &status); 188 189 /** 190 * Compiles the regular expression in string form into a RegexPattern 191 * object using the specified match mode flags. These compile methods, 192 * rather than the constructors, are the usual way that RegexPattern objects 193 * are created. 194 * 195 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 196 * objects created from the pattern are active. RegexMatchers keep a pointer 197 * back to their pattern, so premature deletion of the pattern is a 198 * catastrophic error.</p> 199 * 200 * <p>Note that it is often more convenient to construct a RegexMatcher directly 201 * from a pattern string instead of than separately compiling the pattern and 202 * then creating a RegexMatcher object from the pattern.</p> 203 * 204 * @param regex The regular expression to be compiled. 205 * @param flags The match mode flags to be used. 206 * @param pe Receives the position (line and column nubers) of any error 207 * within the regular expression.) 208 * @param status A reference to a UErrorCode to receive any errors. 
209 * @return A regexPattern object for the compiled pattern. 210 * 211 * @stable ICU 2.4 212 */ 213 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 214 uint32_t flags, 215 UParseError &pe, 216 UErrorCode &status); 217 218 219 /** 220 * Compiles the regular expression in string form into a RegexPattern 221 * object using the specified match mode flags. These compile methods, 222 * rather than the constructors, are the usual way that RegexPattern objects 223 * are created. 224 * 225 * <p>Note that RegexPattern objects must not be deleted while RegexMatcher 226 * objects created from the pattern are active. RegexMatchers keep a pointer 227 * back to their pattern, so premature deletion of the pattern is a 228 * catastrophic error.</p> 229 * 230 * <p>Note that it is often more convenient to construct a RegexMatcher directly 231 * from a pattern string instead of than separately compiling the pattern and 232 * then creating a RegexMatcher object from the pattern.</p> 233 * 234 * @param regex The regular expression to be compiled. 235 * @param flags The match mode flags to be used. 236 * @param status A reference to a UErrorCode to receive any errors. 237 * @return A regexPattern object for the compiled pattern. 238 * 239 * @stable ICU 2.6 240 */ 241 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 242 uint32_t flags, 243 UErrorCode &status); 244 245 246 /** 247 * Get the match mode flags that were used when compiling this pattern. 248 * @return the match mode flags 249 * @stable ICU 2.4 250 */ 251 virtual uint32_t flags() const; 252 253 /** 254 * Creates a RegexMatcher that will match the given input against this pattern. The 255 * RegexMatcher can then be used to perform match, find or replace operations 256 * on the input. Note that a RegexPattern object must not be deleted while 257 * RegexMatchers created from it still exist and might possibly be used again. 
258 * <p> 259 * The matcher will retain a reference to the supplied input string, and all regexp 260 * pattern matching operations happen directly on this original string. It is 261 * critical that the string not be altered or deleted before use by the regular 262 * expression operations is complete. 263 * 264 * @param input The input string to which the regular expression will be applied. 265 * @param status A reference to a UErrorCode to receive any errors. 266 * @return A RegexMatcher object for this pattern and input. 267 * 268 * @stable ICU 2.4 269 */ 270 virtual RegexMatcher *matcher(const UnicodeString &input, 271 UErrorCode &status) const; 272 273 private: 274 /** 275 * Cause a compilation error if an application accidently attempts to 276 * create a matcher with a (UChar *) string as input rather than 277 * a UnicodeString. Avoids a dangling reference to a temporary string. 278 * <p> 279 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 280 * using one of the aliasing constructors, such as 281 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 282 * 283 * @internal 284 */ 285 RegexMatcher *matcher(const UChar *input, 286 UErrorCode &status) const; 287 public: 288 289 290 /** 291 * Creates a RegexMatcher that will match against this pattern. The 292 * RegexMatcher can be used to perform match, find or replace operations. 293 * Note that a RegexPattern object must not be deleted while 294 * RegexMatchers created from it still exist and might possibly be used again. 295 * 296 * @param status A reference to a UErrorCode to receive any errors. 297 * @return A RegexMatcher object for this pattern and input. 298 * 299 * @stable ICU 2.6 300 */ 301 virtual RegexMatcher *matcher(UErrorCode &status) const; 302 303 304 /** 305 * Test whether a string matches a regular expression. This convenience function 306 * both compiles the reguluar expression and applies it in a single operation. 
307 * Note that if the same pattern needs to be applied repeatedly, this method will be 308 * less efficient than creating and reusing a RegexMatcher object. 309 * 310 * @param regex The regular expression 311 * @param input The string data to be matched 312 * @param pe Receives the position of any syntax errors within the regular expression 313 * @param status A reference to a UErrorCode to receive any errors. 314 * @return True if the regular expression exactly matches the full input string. 315 * 316 * @stable ICU 2.4 317 */ 318 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 319 const UnicodeString &input, 320 UParseError &pe, 321 UErrorCode &status); 322 323 324 /** 325 * Returns the regular expression from which this pattern was compiled. 326 * @stable ICU 2.4 327 */ 328 virtual UnicodeString pattern() const; 329 330 331 /** 332 * Split a string into fields. Somewhat like split() from Perl. 333 * The pattern matches identify delimiters that separate the input 334 * into fields. The input data between the matches becomes the 335 * fields themselves. 336 * <p> 337 * For the best performance on split() operations, 338 * <code>RegexMatcher::split</code> is perferable to this function 339 * 340 * @param input The string to be split into fields. The field delimiters 341 * match the pattern (in the "this" object) 342 * @param dest An array of UnicodeStrings to receive the results of the split. 343 * This is an array of actual UnicodeString objects, not an 344 * array of pointers to strings. Local (stack based) arrays can 345 * work well here. 346 * @param destCapacity The number of elements in the destination array. 347 * If the number of fields found is less than destCapacity, the 348 * extra strings in the destination array are not altered. 349 * If the number of destination strings is less than the number 350 * of fields, the trailing part of the input string, including any 351 * field delimiters, is placed in the last destination string. 
352 * @param status A reference to a UErrorCode to receive any errors. 353 * @return The number of fields into which the input string was split. 354 * @stable ICU 2.4 355 */ 356 virtual int32_t split(const UnicodeString &input, 357 UnicodeString dest[], 358 int32_t destCapacity, 359 UErrorCode &status) const; 360 361 362 /** 363 * ICU "poor man's RTTI", returns a UClassID for the actual class. 364 * 365 * @stable ICU 2.4 366 */ 367 virtual UClassID getDynamicClassID() const; 368 369 /** 370 * ICU "poor man's RTTI", returns a UClassID for this class. 371 * 372 * @stable ICU 2.4 373 */ 374 static UClassID U_EXPORT2 getStaticClassID(); 375 376 private: 377 // 378 // Implementation Data 379 // 380 UnicodeString fPattern; // The original pattern string. 381 uint32_t fFlags; // The flags used when compiling the pattern. 382 // 383 UVector32 *fCompiledPat; // The compiled pattern p-code. 384 UnicodeString fLiteralText; // Any literal string data from the pattern, 385 // after un-escaping, for use during the match. 386 387 UVector *fSets; // Any UnicodeSets referenced from the pattern. 388 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 389 390 391 UErrorCode fDeferredStatus; // status if some prior error has left this 392 // RegexPattern in an unusable state. 393 394 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 395 // >= this value. For some patterns, this calculated 396 // value may be less than the true shortest 397 // possible match. 398 399 int32_t fFrameSize; // Size of a state stack frame in the 400 // execution engine. 401 402 int32_t fDataSize; // The size of the data needed by the pattern that 403 // does not go on the state stack, but has just 404 // a single copy per matcher. 405 406 UVector32 *fGroupMap; // Map from capture group number to position of 407 // the group's variables in the matcher stack frame. 
408 409 int32_t fMaxCaptureDigits; 410 411 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 412 // regex character classes, e.g. Word. 413 414 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 415 // sets for predefined regex classes. 416 417 int32_t fStartType; // Info on how a match must start. 418 int32_t fInitialStringIdx; // 419 int32_t fInitialStringLen; 420 UnicodeSet *fInitialChars; 421 UChar32 fInitialChar; 422 Regex8BitSet *fInitialChars8; 423 424 friend class RegexCompile; 425 friend class RegexMatcher; 426 friend class RegexCImpl; 427 428 // 429 // Implementation Methods 430 // 431 void init(); // Common initialization, for use by constructors. 432 void zap(); // Common cleanup 433 #ifdef REGEX_DEBUG 434 void dumpOp(int32_t index) const; 435 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 436 #endif 437 438 }; 439 440 441 442 /** 443 * class RegexMatcher bundles together a reular expression pattern and 444 * input text to which the expression can be applied. It includes methods 445 * for testing for matches, and for find and replace operations. 446 * 447 * <p>Class RegexMatcher is not intended to be subclassed.</p> 448 * 449 * @stable ICU 2.4 450 */ 451 class U_I18N_API RegexMatcher: public UObject { 452 public: 453 454 /** 455 * Construct a RegexMatcher for a regular expression. 456 * This is a convenience method that avoids the need to explicitly create 457 * a RegexPattern object. Note that if several RegexMatchers need to be 458 * created for the same expression, it will be more efficient to 459 * separately create and cache a RegexPattern object, and use 460 * its matcher() method to create the RegexMatcher objects. 461 * 462 * @param regexp The Regular Expression to be compiled. 463 * @param flags Regular expression options, such as case insensitive matching. 464 * @see UREGEX_CASE_INSENSITIVE 465 * @param status Any errors are reported by setting this UErrorCode variable. 
466 * @stable ICU 2.6 467 */ 468 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 469 470 /** 471 * Construct a RegexMatcher for a regular expression. 472 * This is a convenience method that avoids the need to explicitly create 473 * a RegexPattern object. Note that if several RegexMatchers need to be 474 * created for the same expression, it will be more efficient to 475 * separately create and cache a RegexPattern object, and use 476 * its matcher() method to create the RegexMatcher objects. 477 * <p> 478 * The matcher will retain a reference to the supplied input string, and all regexp 479 * pattern matching operations happen directly on the original string. It is 480 * critical that the string not be altered or deleted before use by the regular 481 * expression operations is complete. 482 * 483 * @param regexp The Regular Expression to be compiled. 484 * @param input The string to match. The matcher retains a reference to the 485 * caller's string; mo copy is made. 486 * @param flags Regular expression options, such as case insensitive matching. 487 * @see UREGEX_CASE_INSENSITIVE 488 * @param status Any errors are reported by setting this UErrorCode variable. 489 * @stable ICU 2.6 490 */ 491 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 492 uint32_t flags, UErrorCode &status); 493 494 private: 495 /** 496 * Cause a compilation error if an application accidently attempts to 497 * create a matcher with a (UChar *) string as input rather than 498 * a UnicodeString. Avoids a dangling reference to a temporary string. 
499 * <p> 500 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 501 * using one of the aliasing constructors, such as 502 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 503 * 504 * @internal 505 */ 506 RegexMatcher(const UnicodeString ®exp, const UChar *input, 507 uint32_t flags, UErrorCode &status); 508 public: 509 510 511 /** 512 * Destructor. 513 * 514 * @stable ICU 2.4 515 */ 516 virtual ~RegexMatcher(); 517 518 519 /** 520 * Attempts to match the entire input region against the pattern. 521 * @param status A reference to a UErrorCode to receive any errors. 522 * @return TRUE if there is a match 523 * @stable ICU 2.4 524 */ 525 virtual UBool matches(UErrorCode &status); 526 527 /** 528 * Resets the matcher, then attempts to match the input beginning 529 * at the specified startIndex, and extending to the end of the input. 530 * The input region is reset to include the entire input string. 531 * A successful match must extend to the end of the input. 532 * @param startIndex The input string index at which to begin matching. 533 * @param status A reference to a UErrorCode to receive any errors. 534 * @return TRUE if there is a match 535 * @stable ICU 2.8 536 */ 537 virtual UBool matches(int32_t startIndex, UErrorCode &status); 538 539 540 541 542 /** 543 * Attempts to match the input string, starting from the beginning of the region, 544 * against the pattern. Like the matches() method, this function 545 * always starts at the beginning of the input region; 546 * unlike that function, it does not require that the entire region be matched. 547 * 548 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, 549 * <code>end()</code>, and <code>group()</code> functions.</p> 550 * 551 * @param status A reference to a UErrorCode to receive any errors. 552 * @return TRUE if there is a match at the start of the input string. 
553 * @stable ICU 2.4 554 */ 555 virtual UBool lookingAt(UErrorCode &status); 556 557 558 /** 559 * Attempts to match the input string, starting from the specified index, against the pattern. 560 * The match may be of any length, and is not required to extend to the end 561 * of the input string. Contrast with matches(). 562 * 563 * <p>If the match succeeds then more information can be obtained via the <code>start()</code>, 564 * <code>end()</code>, and <code>group()</code> functions.</p> 565 * 566 * @param startIndex The input string index at which to begin matching. 567 * @param status A reference to a UErrorCode to receive any errors. 568 * @return TRUE if there is a match. 569 * @stable ICU 2.8 570 */ 571 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status); 572 573 /** 574 * Find the next pattern match in the input string. 575 * The find begins searching the input at the location following the end of 576 * the previous match, or at the start of the string if there is no previous match. 577 * If a match is found, <code>start(), end()</code> and <code>group()</code> 578 * will provide more information regarding the match. 579 * <p>Note that if the input string is changed by the application, 580 * use find(startPos, status) instead of find(), because the saved starting 581 * position may not be valid with the altered input string.</p> 582 * @return TRUE if a match is found. 583 * @stable ICU 2.4 584 */ 585 virtual UBool find(); 586 587 588 /** 589 * Resets this RegexMatcher and then attempts to find the next substring of the 590 * input string that matches the pattern, starting at the specified index. 591 * 592 * @param start the position in the input string to begin the search 593 * @param status A reference to a UErrorCode to receive any errors. 594 * @return TRUE if a match is found. 
595 * @stable ICU 2.4 596 */ 597 virtual UBool find(int32_t start, UErrorCode &status); 598 599 600 /** 601 * Returns a string containing the text matched by the previous match. 602 * If the pattern can match an empty string, an empty string may be returned. 603 * @param status A reference to a UErrorCode to receive any errors. 604 * Possible errors are U_REGEX_INVALID_STATE if no match 605 * has been attempted or the last match failed. 606 * @return a string containing the matched input text. 607 * @stable ICU 2.4 608 */ 609 virtual UnicodeString group(UErrorCode &status) const; 610 611 612 /** 613 * Returns a string containing the text captured by the given group 614 * during the previous match operation. Group(0) is the entire match. 615 * 616 * @param groupNum the capture group number 617 * @param status A reference to a UErrorCode to receive any errors. 618 * Possible errors are U_REGEX_INVALID_STATE if no match 619 * has been attempted or the last match failed and 620 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number. 621 * @return the captured text 622 * @stable ICU 2.4 623 */ 624 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 625 626 627 /** 628 * Returns the number of capturing groups in this matcher's pattern. 629 * @return the number of capture groups 630 * @stable ICU 2.4 631 */ 632 virtual int32_t groupCount() const; 633 634 635 /** 636 * Returns the index in the input string of the start of the text matched 637 * during the previous match operation. 638 * @param status a reference to a UErrorCode to receive any errors. 639 * @return The position in the input string of the start of the last match. 640 * @stable ICU 2.4 641 */ 642 virtual int32_t start(UErrorCode &status) const; 643 644 645 /** 646 * Returns the index in the input string of the start of the text matched by the 647 * specified capture group during the previous match operation. 
Return -1 if 648 * the capture group exists in the pattern, but was not part of the last match. 649 * 650 * @param group the capture group number 651 * @param status A reference to a UErrorCode to receive any errors. Possible 652 * errors are U_REGEX_INVALID_STATE if no match has been 653 * attempted or the last match failed, and 654 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number 655 * @return the start position of substring matched by the specified group. 656 * @stable ICU 2.4 657 */ 658 virtual int32_t start(int32_t group, UErrorCode &status) const; 659 660 661 /** 662 * Returns the index in the input string of the first character following the 663 * text matched during the previous match operation. 664 * @param status A reference to a UErrorCode to receive any errors. Possible 665 * errors are U_REGEX_INVALID_STATE if no match has been 666 * attempted or the last match failed. 667 * @return the index of the last character matched, plus one. 668 * @stable ICU 2.4 669 */ 670 virtual int32_t end(UErrorCode &status) const; 671 672 673 /** 674 * Returns the index in the input string of the character following the 675 * text matched by the specified capture group during the previous match operation. 676 * @param group the capture group number 677 * @param status A reference to a UErrorCode to receive any errors. Possible 678 * errors are U_REGEX_INVALID_STATE if no match has been 679 * attempted or the last match failed and 680 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number 681 * @return the index of the first character following the text 682 * captured by the specifed group during the previous match operation. 683 * Return -1 if the capture group exists in the pattern but was not part of the match. 684 * @stable ICU 2.4 685 */ 686 virtual int32_t end(int32_t group, UErrorCode &status) const; 687 688 689 /** 690 * Resets this matcher. 
The effect is to remove any memory of previous matches, 691 * and to cause subsequent find() operations to begin at the beginning of 692 * the input string. 693 * 694 * @return this RegexMatcher. 695 * @stable ICU 2.4 696 */ 697 virtual RegexMatcher &reset(); 698 699 700 /** 701 * Resets this matcher, and sets the current input position. 702 * The effect is to remove any memory of previous matches, 703 * and to cause subsequent find() operations to begin at 704 * the specified position in the input string. 705 * <p> 706 * The matcher's region is reset to its default, which is the entire 707 * input string. 708 * <p> 709 * An alternative to this function is to set a match region 710 * beginning at the desired index. 711 * 712 * @return this RegexMatcher. 713 * @stable ICU 2.8 714 */ 715 virtual RegexMatcher &reset(int32_t index, UErrorCode &status); 716 717 718 /** 719 * Resets this matcher with a new input string. This allows instances of RegexMatcher 720 * to be reused, which is more efficient than creating a new RegexMatcher for 721 * each input string to be processed. 722 * @param input The new string on which subsequent pattern matches will operate. 723 * The matcher retains a reference to the caller's string, and operates 724 * directly on that. Ownership of the string remains with the caller. 725 * Because no copy of the string is made, it is essential that the 726 * caller not delete the string until after regexp operations on it 727 * are done. 728 * @return this RegexMatcher. 729 * @stable ICU 2.4 730 */ 731 virtual RegexMatcher &reset(const UnicodeString &input); 732 733 private: 734 /** 735 * Cause a compilation error if an application accidentally attempts to 736 * reset a matcher with a (UChar *) string as input rather than 737 * a UnicodeString. Avoids a dangling reference to a temporary string. 
738 * <p> 739 * To efficiently work with UChar *strings, wrap the data in a UnicodeString 740 * using one of the aliasing constructors, such as 741 * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code> 742 * 743 * @internal 744 */ 745 RegexMatcher &reset(const UChar *input); 746 public: 747 748 /** 749 * Returns the input string being matched. The returned string is not a copy, 750 * but the live input string. It should not be altered or deleted. 751 * @return the input string 752 * @stable ICU 2.4 753 */ 754 virtual const UnicodeString &input() const; 755 756 757 758 /** Sets the limits of this matcher's region. 759 * The region is the part of the input string that will be searched to find a match. 760 * Invoking this method resets the matcher, and then sets the region to start 761 * at the index specified by the start parameter and end at the index specified 762 * by the end parameter. 763 * 764 * Depending on the transparency and anchoring being used (see useTransparentBounds 765 * and useAnchoringBounds), certain constructs such as anchors may behave differently 766 * at or around the boundaries of the region 767 * 768 * The function will fail if start is greater than limit, or if either index 769 * is less than zero or greater than the length of the string being matched. 770 * 771 * @param start The index to begin searches at. 772 * @param limit The index to end searches at (exclusive). 773 * @param status A reference to a UErrorCode to receive any errors. 774 * @draft ICU 4.0 775 */ 776 virtual RegexMatcher ®ion(int32_t start, int32_t limit, UErrorCode &status); 777 778 779 /** 780 * Reports the start index of this matcher's region. The searches this matcher 781 * conducts are limited to finding matches within regionStart (inclusive) and 782 * regionEnd (exclusive). 783 * 784 * @return The starting index of this matcher's region. 
785 * @draft ICU 4.0 786 */ 787 virtual int32_t regionStart() const; 788 789 790 /** 791 * Reports the end (limit) index (exclusive) of this matcher's region. The searches 792 * this matcher conducts are limited to finding matches within regionStart 793 * (inclusive) and regionEnd (exclusive). 794 * 795 * @return The ending point of this matcher's region. 796 * @draft ICU 4.0 797 */ 798 virtual int32_t regionEnd() const; 799 800 /** 801 * Queries the transparency of region bounds for this matcher. 802 * See useTransparentBounds for a description of transparent and opaque bounds. 803 * By default, a matcher uses opaque region boundaries. 804 * 805 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. 806 * @draft ICU 4.0 807 */ 808 virtual UBool hasTransparentBounds() const; 809 810 /** 811 * Sets the transparency of region bounds for this matcher. 812 * Invoking this function with an argument of true will set this matcher to use transparent bounds. 813 * If the boolean argument is false, then opaque bounds will be used. 814 * 815 * Using transparent bounds, the boundaries of this matcher's region are transparent 816 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 817 * see text beyond the boundaries of the region while checking for a match. 818 * 819 * With opaque bounds, no text outside of the matcher's region is visible to lookahead, 820 * lookbehind, and boundary matching constructs. 821 * 822 * By default, a matcher uses opaque bounds. 823 * 824 * @param b TRUE for transparent bounds; FALSE for opaque bounds 825 * @return This Matcher. 826 * @draft ICU 4.0 827 **/ 828 virtual RegexMatcher &useTransparentBounds(UBool b); 829 830 831 /** 832 * Return true if this matcher is using anchoring bounds. 833 * By default, matchers use anchoring region bounds. 834 * 835 * @return TRUE if this matcher is using anchoring bounds. 
836 * @draft ICU 4.0 837 */ 838 virtual UBool hasAnchoringBounds() const; 839 840 /** 841 * Set whether this matcher is using Anchoring Bounds for its region. 842 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 843 * and end of the region. Without Anchoring Bounds, anchors will only match at 844 * the positions they would in the complete text. 845 * 846 * Anchoring Bounds are the default for regions. 847 * 848 * @param b TRUE to enable anchoring bounds; FALSE to disable them. 849 * @return This Matcher 850 * @draft ICU 4.0 851 */ 852 virtual RegexMatcher &useAnchoringBounds(UBool b); 853 854 /** 855 * Return TRUE if the most recent matching operation touched the 856 * end of the text being processed. In this case, additional input text could 857 * change the results of that match. 858 * 859 * hitEnd() is defined for both successful and unsuccessful matches. 860 * In either case hitEnd() will return TRUE if the end of the text was 861 * reached at any point during the matching process. 862 * 863 * @return TRUE if the most recent match hit the end of input 864 * @draft ICU 4.0 865 */ 866 virtual UBool hitEnd() const; 867 868 /** 869 * Return TRUE if the most recent match succeeded and additional input could cause 870 * it to fail. If this method returns false and a match was found, then more input 871 * might change the match but the match won't be lost. If a match was not found, 872 * then requireEnd has no meaning. 873 * 874 * @return TRUE if more input could cause the most recent match to no longer match. 875 * @draft ICU 4.0 876 */ 877 virtual UBool requireEnd() const; 878 879 880 881 882 883 /** 884 * Returns the pattern that is interpreted by this matcher. 885 * @return the RegexPattern for this RegexMatcher 886 * @stable ICU 2.4 887 */ 888 virtual const RegexPattern &pattern() const; 889 890 891 /** 892 * Replaces every substring of the input that matches the pattern 893 * with the given replacement string. 
This is a convenience function that 894 * provides a complete find-and-replace-all operation. 895 * 896 * This method first resets this matcher. It then scans the input string 897 * looking for matches of the pattern. Input that is not part of any 898 * match is left unchanged; each match is replaced in the result by the 899 * replacement string. The replacement string may contain references to 900 * capture groups. 901 * 902 * @param replacement a string containing the replacement text. 903 * @param status a reference to a UErrorCode to receive any errors. 904 * @return a string containing the results of the find and replace. 905 * @stable ICU 2.4 906 */ 907 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 908 909 910 /** 911 * Replaces the first substring of the input that matches 912 * the pattern with the replacement string. This is a convenience 913 * function that provides a complete find-and-replace operation. 914 * 915 * <p>This function first resets this RegexMatcher. It then scans the input string 916 * looking for a match of the pattern. Input that is not part 917 * of the match is appended directly to the result string; the match is replaced 918 * in the result by the replacement string. The replacement string may contain 919 * references to capture groups.</p> 920 * 921 * <p>The state of the matcher (the position at which a subsequent find() 922 * would begin) after completing a replaceFirst() is not specified. The 923 * RegexMatcher should be reset before doing additional find() operations.</p> 924 * 925 * @param replacement a string containing the replacement text. 926 * @param status a reference to a UErrorCode to receive any errors. 927 * @return a string containing the results of the find and replace.
928 * @stable ICU 2.4 929 */ 930 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 931 932 /** 933 * Implements a replace operation intended to be used as part of an 934 * incremental find-and-replace. 935 * 936 * <p>The input string, starting from the end of the previous replacement and ending at 937 * the start of the current match, is appended to the destination string. Then the 938 * replacement string is appended to the destination string, 939 * including handling any substitutions of captured text.</p> 940 * 941 * <p>For simple, prepackaged, non-incremental find-and-replace 942 * operations, see replaceFirst() or replaceAll().</p> 943 * 944 * @param dest A UnicodeString to which the results of the find-and-replace are appended. 945 * @param replacement A UnicodeString that provides the text to be substituted for 946 * the input text that matched the regexp pattern. The replacement 947 * text may contain references to captured text from the 948 * input. 949 * @param status A reference to a UErrorCode to receive any errors. Possible 950 * errors are U_REGEX_INVALID_STATE if no match has been 951 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR 952 * if the replacement text specifies a capture group that 953 * does not exist in the pattern. 954 * 955 * @return this RegexMatcher 956 * @stable ICU 2.4 957 * 958 */ 959 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 960 const UnicodeString &replacement, UErrorCode &status); 961 962 963 /** 964 * As the final step in a find-and-replace operation, append the remainder 965 * of the input string, starting at the position following the last appendReplacement(), 966 * to the destination string. <code>appendTail()</code> is intended to be invoked after one 967 * or more invocations of <code>RegexMatcher::appendReplacement()</code>. 968 * 969 * @param dest A UnicodeString to which the results of the find-and-replace are appended.
970 * @return the destination string. 971 * @stable ICU 2.4 972 */ 973 virtual UnicodeString &appendTail(UnicodeString &dest); 974 975 976 977 /** 978 * Split a string into fields. Somewhat like split() from Perl. 979 * The pattern matches identify delimiters that separate the input 980 * into fields. The input data between the matches becomes the 981 * fields themselves. 982 * 983 * 984 * @param input The string to be split into fields. The field delimiters 985 * match the pattern (in the "this" object). This matcher 986 * will be reset to this input string. 987 * @param dest An array of UnicodeStrings to receive the results of the split. 988 * This is an array of actual UnicodeString objects, not an 989 * array of pointers to strings. Local (stack based) arrays can 990 * work well here. 991 * @param destCapacity The number of elements in the destination array. 992 * If the number of fields found is less than destCapacity, the 993 * extra strings in the destination array are not altered. 994 * If the number of destination strings is less than the number 995 * of fields, the trailing part of the input string, including any 996 * field delimiters, is placed in the last destination string. 997 * @param status A reference to a UErrorCode to receive any errors. 998 * @return The number of fields into which the input string was split. 999 * @stable ICU 2.6 1000 */ 1001 virtual int32_t split(const UnicodeString &input, 1002 UnicodeString dest[], 1003 int32_t destCapacity, 1004 UErrorCode &status); 1005 1006 1007 1008 /** 1009 * setTrace Debug function, enable/disable tracing of the matching engine. 1010 * For internal ICU development use only. DO NOT USE!!!! 1011 * @internal 1012 */ 1013 void setTrace(UBool state); 1014 1015 1016 /** 1017 * ICU "poor man's RTTI", returns a UClassID for this class. 1018 * 1019 * @stable ICU 2.2 1020 */ 1021 static UClassID U_EXPORT2 getStaticClassID(); 1022 1023 /** 1024 * ICU "poor man's RTTI", returns a UClassID for the actual class.
1025 * 1026 * @stable ICU 2.2 1027 */ 1028 virtual UClassID getDynamicClassID() const; 1029 1030 private: 1031 // Constructors and other object boilerplate are private. 1032 // Instances of RegexMatcher cannot be assigned, copied, cloned, etc. 1033 RegexMatcher(); // default constructor not implemented 1034 RegexMatcher(const RegexPattern *pat); 1035 RegexMatcher(const RegexMatcher &other); 1036 RegexMatcher &operator =(const RegexMatcher &rhs); 1037 friend class RegexPattern; 1038 friend class RegexCImpl; 1039 public: 1040 /** @internal */ 1041 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 1042 private: 1043 1044 // 1045 // MatchAt This is the internal interface to the match engine itself. 1046 // Match status comes back in matcher member variables. 1047 // 1048 void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 1049 inline void backTrack(int32_t &inputIdx, int32_t &patIdx); 1050 UBool isWordBoundary(int32_t pos); // perform Perl-like \b test 1051 UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test 1052 REStackFrame *resetStack(); 1053 inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, 1054 int32_t frameSize, UErrorCode &status); 1055 1056 1057 const RegexPattern *fPattern; 1058 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 1059 // should delete it when through. 1060 1061 const UnicodeString *fInput; // The text being matched. Is never NULL. 1062 1063 int32_t fRegionStart; // Start of the input region, default = 0. 1064 int32_t fRegionLimit; // End of input region, default to input.length. 1065 1066 int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 1067 int32_t fAnchorLimit; // See useAnchoringBounds 1068 1069 int32_t fLookStart; // Region bounds for look-ahead/behind and 1070 int32_t fLookLimit; // other boundary tests. See 1071 // useTransparentBounds 1072 1073 int32_t fActiveStart; // Currently active bounds for matching.
1074 int32_t fActiveLimit; // Usually is the same as region, but 1075 // is changed to fLookStart/Limit when 1076 // entering look-around regions. 1077 1078 UBool fTransparentBounds; // True if using transparent bounds. 1079 UBool fAnchoringBounds; // True if using anchoring bounds. 1080 1081 UBool fMatch; // True if the last attempted match was successful. 1082 int32_t fMatchStart; // Position of the start of the most recent match 1083 int32_t fMatchEnd; // First position after the end of the most recent match. 1084 // Zero if no previous match, even when a region 1085 // is active. 1086 int32_t fLastMatchEnd; // First position after the end of the previous match, 1087 // or -1 if there was no previous match. 1088 int32_t fAppendPosition; // First position after the end of the previous 1089 // appendReplacement(). As described by the 1090 // JavaDoc for Java Matcher, where it is called 1091 // "append position" 1092 UBool fHitEnd; // True if the last match touched the end of input. 1093 UBool fRequireEnd; // True if the last match required end-of-input 1094 // (matched $ or Z) 1095 1096 UVector32 *fStack; 1097 REStackFrame *fFrame; // After finding a match, the last active stack frame, 1098 // which will contain the capture group results. 1099 // NOT valid while match engine is running. 1100 1101 int32_t *fData; // Data area for use by the compiled pattern. 1102 int32_t fSmallData[8]; // Use this for data if it's enough. 1103 1104 UBool fTraceDebug; // Set true for debug tracing of match engine. 1105 1106 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 1107 // reported, or that permanently disables this matcher. 1108 1109 RuleBasedBreakIterator *fWordBreakItr; 1110 1111 1112 }; 1113 1114 U_NAMESPACE_END 1115 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1116 #endif // REGEX_H 1117