1 /* 2 ********************************************************************** 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: uregex.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2004mar09 11 * created by: Andy Heninger 12 * 13 * ICU Regular Expressions, API for C 14 */ 15 16 /** 17 * \file 18 * \brief C API: Regular Expressions 19 * 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 21 */ 22 23 #ifndef UREGEX_H 24 #define UREGEX_H 25 26 #include "unicode/utext.h" 27 #include "unicode/utypes.h" 28 29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 30 31 #include "unicode/localpointer.h" 32 #include "unicode/parseerr.h" 33 34 struct URegularExpression; 35 /** 36 * Structure representing a compiled regular rexpression, plus the results 37 * of a match operation. 38 * @stable ICU 3.0 39 */ 40 typedef struct URegularExpression URegularExpression; 41 42 43 /** 44 * Constants for Regular Expression Match Modes. 45 * @stable ICU 2.4 46 */ 47 typedef enum URegexpFlag{ 48 49 #ifndef U_HIDE_DRAFT_API 50 /** Forces normalization of pattern and strings. 51 Not implemented yet, just a placeholder, hence draft. 52 @draft ICU 2.4 */ 53 UREGEX_CANON_EQ = 128, 54 #endif 55 /** Enable case insensitive matching. @stable ICU 2.4 */ 56 UREGEX_CASE_INSENSITIVE = 2, 57 58 /** Allow white space and comments within patterns @stable ICU 2.4 */ 59 UREGEX_COMMENTS = 4, 60 61 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 62 * @stable ICU 2.4 */ 63 UREGEX_DOTALL = 32, 64 65 /** If set, treat the entire pattern as a literal string. 66 * Metacharacters or escape sequences in the input sequence will be given 67 * no special meaning. Not implemented yet as of ICU 4.4. 
68 * 69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact 70 * on matching when used in conjunction with this flag. 71 * The other flags become superfluous. 72 * TODO: say which escapes are still handled; anything Java does 73 * early (\\u) we should still do. 74 * @stable ICU 4.0 75 */ 76 UREGEX_LITERAL = 16, 77 78 /** Control behavior of "$" and "^" 79 * If set, recognize line terminators within string, 80 * otherwise, match only at start and end of input string. 81 * @stable ICU 2.4 */ 82 UREGEX_MULTILINE = 8, 83 84 /** Unix-only line endings. 85 * When this mode is enabled, only \\u000a is recognized as a line ending 86 * in the behavior of ., ^, and $. 87 * @stable ICU 4.0 88 */ 89 UREGEX_UNIX_LINES = 1, 90 91 /** Unicode word boundaries. 92 * If set, \b uses the Unicode TR 29 definition of word boundaries. 93 * Warning: Unicode word boundaries are quite different from 94 * traditional regular expression word boundaries. See 95 * http://unicode.org/reports/tr29/#Word_Boundaries 96 * @stable ICU 2.8 97 */ 98 UREGEX_UWORD = 256, 99 100 /** Error on Unrecognized backslash escapes. 101 * If set, fail with an error on patterns that contain 102 * backslash-escaped ASCII letters without a known specail 103 * meaning. If this flag is not set, these 104 * escaped letters represent themselves. 105 * @stable ICU 4.0 106 */ 107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 108 109 } URegexpFlag; 110 111 /** 112 * Open (compile) an ICU regular expression. Compiles the regular expression in 113 * string form into an internal representation using the specified match mode flags. 114 * The resulting regular expression handle can then be used to perform various 115 * matching operations. 116 * 117 * 118 * @param pattern The Regular Expression pattern to be compiled. 119 * @param patternLength The length of the pattern, or -1 if the pattern is 120 * NUL termintated. 
121 * @param flags Flags that alter the default matching behavior for 122 * the regular expression, UREGEX_CASE_INSENSITIVE, for 123 * example. For default behavior, set this parameter to zero. 124 * See <code>enum URegexpFlag</code>. All desired flags 125 * are bitwise-ORed together. 126 * @param pe Receives the position (line and column nubers) of any syntax 127 * error within the source regular expression string. If this 128 * information is not wanted, pass NULL for this parameter. 129 * @param status Receives error detected by this function. 130 * @stable ICU 3.0 131 * 132 */ 133 U_STABLE URegularExpression * U_EXPORT2 134 uregex_open( const UChar *pattern, 135 int32_t patternLength, 136 uint32_t flags, 137 UParseError *pe, 138 UErrorCode *status); 139 140 /** 141 * Open (compile) an ICU regular expression. Compiles the regular expression in 142 * string form into an internal representation using the specified match mode flags. 143 * The resulting regular expression handle can then be used to perform various 144 * matching operations. 145 * <p> 146 * The contents of the pattern UText will be extracted and saved. Ownership of the 147 * UText struct itself remains with the caller. This is to match the behavior of 148 * uregex_open(). 149 * 150 * @param pattern The Regular Expression pattern to be compiled. 151 * @param flags Flags that alter the default matching behavior for 152 * the regular expression, UREGEX_CASE_INSENSITIVE, for 153 * example. For default behavior, set this parameter to zero. 154 * See <code>enum URegexpFlag</code>. All desired flags 155 * are bitwise-ORed together. 156 * @param pe Receives the position (line and column nubers) of any syntax 157 * error within the source regular expression string. If this 158 * information is not wanted, pass NULL for this parameter. 159 * @param status Receives error detected by this function. 
160 * 161 * @internal ICU 4.4 technology preview 162 */ 163 U_INTERNAL URegularExpression * U_EXPORT2 164 uregex_openUText(UText *pattern, 165 uint32_t flags, 166 UParseError *pe, 167 UErrorCode *status); 168 169 /** 170 * Open (compile) an ICU regular expression. The resulting regular expression 171 * handle can then be used to perform various matching operations. 172 * <p> 173 * This function is the same as uregex_open, except that the pattern 174 * is supplied as an 8 bit char * string in the default code page. 175 * 176 * @param pattern The Regular Expression pattern to be compiled, 177 * NUL termintated. 178 * @param flags Flags that alter the default matching behavior for 179 * the regular expression, UREGEX_CASE_INSENSITIVE, for 180 * example. For default behavior, set this parameter to zero. 181 * See <code>enum URegexpFlag</code>. All desired flags 182 * are bitwise-ORed together. 183 * @param pe Receives the position (line and column nubers) of any syntax 184 * error within the source regular expression string. If this 185 * information is not wanted, pass NULL for this parameter. 186 * @param status Receives errors detected by this function. 187 * @return The URegularExpression object representing the compiled 188 * pattern. 189 * 190 * @stable ICU 3.0 191 */ 192 #if !UCONFIG_NO_CONVERSION 193 U_STABLE URegularExpression * U_EXPORT2 194 uregex_openC( const char *pattern, 195 uint32_t flags, 196 UParseError *pe, 197 UErrorCode *status); 198 #endif 199 200 201 202 /** 203 * Close the regular expression, recovering all resources (memory) it 204 * was holding. 205 * 206 * @param regexp The regular expression to be closed. 207 * @stable ICU 3.0 208 */ 209 U_STABLE void U_EXPORT2 210 uregex_close(URegularExpression *regexp); 211 212 #if U_SHOW_CPLUSPLUS_API 213 214 U_NAMESPACE_BEGIN 215 216 /** 217 * \class LocalURegularExpressionPointer 218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 
219 * For most methods see the LocalPointerBase base class. 220 * 221 * @see LocalPointerBase 222 * @see LocalPointer 223 * @draft ICU 4.4 224 */ 225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 226 227 U_NAMESPACE_END 228 229 #endif 230 231 /** 232 * Make a copy of a compiled regular expression. Cloning a regular 233 * expression is faster than opening a second instance from the source 234 * form of the expression, and requires less memory. 235 * <p> 236 * Note that the current input string and the position of any matched text 237 * within it are not cloned; only the pattern itself and and the 238 * match mode flags are copied. 239 * <p> 240 * Cloning can be particularly useful to threaded applications that perform 241 * multiple match operations in parallel. Each concurrent RE 242 * operation requires its own instance of a URegularExpression. 243 * 244 * @param regexp The compiled regular expression to be cloned. 245 * @param status Receives indication of any errors encountered 246 * @return the cloned copy of the compiled regular expression. 247 * @stable ICU 3.0 248 */ 249 U_STABLE URegularExpression * U_EXPORT2 250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 251 252 /** 253 * Returns a pointer to the source form of the pattern for this regular expression. 254 * This function will work even if the pattern was originally specified as a UText. 255 * 256 * @param regexp The compiled regular expression. 257 * @param patLength This output parameter will be set to the length of the 258 * pattern string. A NULL pointer may be used here if the 259 * pattern length is not needed, as would be the case if 260 * the pattern is known in advance to be a NUL terminated 261 * string. 262 * @param status Receives errors detected by this function. 263 * @return a pointer to the pattern string. 
The storage for the string is 264 * owned by the regular expression object, and must not be 265 * altered or deleted by the application. The returned string 266 * will remain valid until the regular expression is closed. 267 * @stable ICU 3.0 268 */ 269 U_STABLE const UChar * U_EXPORT2 270 uregex_pattern(const URegularExpression *regexp, 271 int32_t *patLength, 272 UErrorCode *status); 273 274 /** 275 * Returns the source text of the pattern for this regular expression. 276 * This function will work even if the pattern was originally specified as a UChar string. 277 * 278 * @param regexp The compiled regular expression. 279 * @param status Receives errors detected by this function. 280 * @return the pattern text. The storage for the text is owned by the regular expression 281 * object, and must not be altered or deleted. 282 * 283 * @internal ICU 4.4 technology preview 284 */ 285 U_INTERNAL UText * U_EXPORT2 286 uregex_patternUText(const URegularExpression *regexp, 287 UErrorCode *status); 288 289 290 /** 291 * Get the match mode flags that were specified when compiling this regular expression. 292 * @param status Receives errors detected by this function. 293 * @param regexp The compiled regular expression. 294 * @return The match mode flags 295 * @see URegexpFlag 296 * @stable ICU 3.0 297 */ 298 U_STABLE int32_t U_EXPORT2 299 uregex_flags(const URegularExpression *regexp, 300 UErrorCode *status); 301 302 303 /** 304 * Set the subject text string upon which the regular expression will look for matches. 305 * This function may be called any number of times, allowing the regular 306 * expression pattern to be applied to different strings. 307 * <p> 308 * Regular expression matching operations work directly on the application's 309 * string data. No copy is made. The subject string data must not be 310 * altered after calling this function until after all regular expression 311 * operations involving this string data are completed. 
312 * <p> 313 * Zero length strings are permitted. In this case, no subsequent match 314 * operation will dereference the text string pointer. 315 * 316 * @param regexp The compiled regular expression. 317 * @param text The subject text string. 318 * @param textLength The length of the subject text, or -1 if the string 319 * is NUL terminated. 320 * @param status Receives errors detected by this function. 321 * @stable ICU 3.0 322 */ 323 U_STABLE void U_EXPORT2 324 uregex_setText(URegularExpression *regexp, 325 const UChar *text, 326 int32_t textLength, 327 UErrorCode *status); 328 329 330 /** 331 * Set the subject text string upon which the regular expression will look for matches. 332 * This function may be called any number of times, allowing the regular 333 * expression pattern to be applied to different strings. 334 * <p> 335 * Regular expression matching operations work directly on the application's 336 * string data; only a shallow clone is made. The subject string data must not be 337 * altered after calling this function until after all regular expression 338 * operations involving this string data are completed. 339 * 340 * @param regexp The compiled regular expression. 341 * @param text The subject text string. 342 * @param status Receives errors detected by this function. 343 * 344 * @internal ICU 4.4 technology preview 345 */ 346 U_INTERNAL void U_EXPORT2 347 uregex_setUText(URegularExpression *regexp, 348 UText *text, 349 UErrorCode *status); 350 351 /** 352 * Get the subject text that is currently associated with this 353 * regular expression object. If the input was supplied using uregex_setText(), 354 * that pointer will be returned. Otherwise, the characters in the input will 355 * be extracted to a buffer and returned. In either case, ownership remains 356 * with the regular expression object. 357 * 358 * This function will work even if the input was originally specified as a UText. 359 * 360 * @param regexp The compiled regular expression. 
361 * @param textLength The length of the string is returned in this output parameter. 362 * A NULL pointer may be used here if the 363 * text length is not needed, as would be the case if 364 * the text is known in advance to be a NUL terminated 365 * string. 366 * @param status Receives errors detected by this function. 367 * @return Pointer to the subject text string currently associated with 368 * this regular expression. 369 * @stable ICU 3.0 370 */ 371 U_STABLE const UChar * U_EXPORT2 372 uregex_getText(URegularExpression *regexp, 373 int32_t *textLength, 374 UErrorCode *status); 375 376 377 /** 378 * Get the subject text that is currently associated with this 379 * regular expression object. 380 * 381 * This function will work even if the input was originally specified as a UChar string. 382 * 383 * @param regexp The compiled regular expression. 384 * @param dest A mutable UText in which to store the current input. 385 * If NULL, a new UText will be created as an immutable shallow clone 386 * of the actual input string. 387 * @param status Receives errors detected by this function. 388 * @return The subject text currently associated with this regular expression. 389 * If a pre-allocated UText was provided, it will always be used and returned. 390 * 391 * @internal ICU 4.4 technology preview 392 */ 393 U_INTERNAL UText * U_EXPORT2 394 uregex_getUText(URegularExpression *regexp, 395 UText *dest, 396 UErrorCode *status); 397 398 // BEGIN android-added 399 // Removed it after Android upgrade to ICU4.6. 400 /** 401 * Set the subject text string upon which the regular expression is looking for matches 402 * without changing any other aspect of the matching state. 403 * The new and previous text strings must have the same content. 404 * 405 * This function is intended for use in environments where ICU is operating on 406 * strings that may move around in memory. 
It provides a mechanism for notifying 407 * ICU that the string has been relocated, and providing a new UText to access the 408 * string in its new position. 409 * 410 * Caution: this function is normally used only by very specialized 411 * system-level code. 412 * 413 * @param regexp The compiled regular expression. 414 * @param text The new (moved) text string. 415 * @param status Receives errors detected by this function. 416 * 417 * @internal ICU 4.6 418 */ 419 U_INTERNAL void U_EXPORT2 420 uregex_refreshUText(URegularExpression *regexp, 421 UText *text, 422 UErrorCode *status); 423 // END android-added 424 425 /** 426 * Attempts to match the input string against the pattern. 427 * To succeed, the match must extend to the end of the string, 428 * or cover the complete match region. 429 * 430 * If startIndex >= zero the match operation starts at the specified 431 * index and must extend to the end of the input string. Any region 432 * that has been specified is reset. 433 * 434 * If startIndex == -1 the match must cover the input region, or the entire 435 * input string if no region has been set. This directly corresponds to 436 * Matcher.matches() in Java 437 * 438 * @param regexp The compiled regular expression. 439 * @param startIndex The input string index at which to begin matching, or -1 440 * to match the input Region. 441 * @param status Receives errors detected by this function. 442 * @return TRUE if there is a match 443 * @stable ICU 3.0 444 */ 445 U_STABLE UBool U_EXPORT2 446 uregex_matches(URegularExpression *regexp, 447 int32_t startIndex, 448 UErrorCode *status); 449 450 /** 451 * Attempts to match the input string, starting from the specified index, against the pattern. 452 * The match may be of any length, and is not required to extend to the end 453 * of the input string. Contrast with uregex_matches(). 454 * 455 * <p>If startIndex is >= 0 any input region that was set for this 456 * URegularExpression is reset before the operation begins. 
457 * 458 * <p>If the specified starting index == -1 the match begins at the start of the input 459 * region, or at the start of the full string if no region has been specified. 460 * This corresponds directly with Matcher.lookingAt() in Java. 461 * 462 * <p>If the match succeeds then more information can be obtained via the 463 * <code>uregexp_start()</code>, <code>uregexp_end()</code>, 464 * and <code>uregexp_group()</code> functions.</p> 465 * 466 * @param regexp The compiled regular expression. 467 * @param startIndex The input string index at which to begin matching, or 468 * -1 to match the Input Region 469 * @param status A reference to a UErrorCode to receive any errors. 470 * @return TRUE if there is a match. 471 * @stable ICU 3.0 472 */ 473 U_STABLE UBool U_EXPORT2 474 uregex_lookingAt(URegularExpression *regexp, 475 int32_t startIndex, 476 UErrorCode *status); 477 478 /** 479 * Find the first matching substring of the input string that matches the pattern. 480 * If startIndex is >= zero the search for a match begins at the specified index, 481 * and any match region is reset. This corresponds directly with 482 * Matcher.find(startIndex) in Java. 483 * 484 * If startIndex == -1 the search begins at the start of the input region, 485 * or at the start of the full string if no region has been specified. 486 * 487 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 488 * <code>uregex_group()</code> will provide more information regarding the match. 489 * 490 * @param regexp The compiled regular expression. 491 * @param startIndex The position in the input string to begin the search, or 492 * -1 to search within the Input Region. 493 * @param status A reference to a UErrorCode to receive any errors. 494 * @return TRUE if a match is found. 
495 * @stable ICU 3.0 496 */ 497 U_STABLE UBool U_EXPORT2 498 uregex_find(URegularExpression *regexp, 499 int32_t startIndex, 500 UErrorCode *status); 501 502 /** 503 * Find the next pattern match in the input string. Begin searching 504 * the input at the location following the end of he previous match, 505 * or at the start of the string (or region) if there is no 506 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 507 * <code>uregex_group()</code> will provide more information regarding the match. 508 * 509 * @param regexp The compiled regular expression. 510 * @param status A reference to a UErrorCode to receive any errors. 511 * @return TRUE if a match is found. 512 * @see uregex_reset 513 * @stable ICU 3.0 514 */ 515 U_STABLE UBool U_EXPORT2 516 uregex_findNext(URegularExpression *regexp, 517 UErrorCode *status); 518 519 /** 520 * Get the number of capturing groups in this regular expression's pattern. 521 * @param regexp The compiled regular expression. 522 * @param status A reference to a UErrorCode to receive any errors. 523 * @return the number of capture groups 524 * @stable ICU 3.0 525 */ 526 U_STABLE int32_t U_EXPORT2 527 uregex_groupCount(URegularExpression *regexp, 528 UErrorCode *status); 529 530 /** Extract the string for the specified matching expression or subexpression. 531 * Group #0 is the complete string of matched text. 532 * Group #1 is the text matched by the first set of capturing parentheses. 533 * 534 * @param regexp The compiled regular expression. 535 * @param groupNum The capture group to extract. Group 0 is the complete 536 * match. The value of this parameter must be 537 * less than or equal to the number of capture groups in 538 * the pattern. 539 * @param dest Buffer to receive the matching string data 540 * @param destCapacity Capacity of the dest buffer. 541 * @param status A reference to a UErrorCode to receive any errors. 
542 * @return Length of matching data, 543 * or -1 if no applicable match. 544 * @stable ICU 3.0 545 */ 546 U_STABLE int32_t U_EXPORT2 547 uregex_group(URegularExpression *regexp, 548 int32_t groupNum, 549 UChar *dest, 550 int32_t destCapacity, 551 UErrorCode *status); 552 553 /** Extract the string for the specified matching expression or subexpression. 554 * Group #0 is the complete string of matched text. 555 * Group #1 is the text matched by the first set of capturing parentheses. 556 * 557 * @param regexp The compiled regular expression. 558 * @param groupNum The capture group to extract. Group 0 is the complete 559 * match. The value of this parameter must be 560 * less than or equal to the number of capture groups in 561 * the pattern. 562 * @param dest Mutable UText to receive the matching string data. 563 * If NULL, a new UText will be created (which may not be mutable). 564 * @param status A reference to a UErrorCode to receive any errors. 565 * @return The matching string data. If a pre-allocated UText was provided, 566 * it will always be used and returned. 567 * 568 * @internal ICU 4.4 technology preview 569 */ 570 U_INTERNAL UText * U_EXPORT2 571 uregex_groupUText(URegularExpression *regexp, 572 int32_t groupNum, 573 UText *dest, 574 UErrorCode *status); 575 576 577 /** 578 * Returns the index in the input string of the start of the text matched by the 579 * specified capture group during the previous match operation. Return -1 if 580 * the capture group was not part of the last match. 581 * Group #0 refers to the complete range of matched text. 582 * Group #1 refers to the text matched by the first set of capturing parentheses. 583 * 584 * @param regexp The compiled regular expression. 585 * @param groupNum The capture group number 586 * @param status A reference to a UErrorCode to receive any errors. 587 * @return the starting position in the input of the text matched 588 * by the specified group. 
589 * @stable ICU 3.0 590 */ 591 U_STABLE int32_t U_EXPORT2 592 uregex_start(URegularExpression *regexp, 593 int32_t groupNum, 594 UErrorCode *status); 595 596 /** 597 * Returns the index in the input string of the position following the end 598 * of the text matched by the specified capture group. 599 * Return -1 if the capture group was not part of the last match. 600 * Group #0 refers to the complete range of matched text. 601 * Group #1 refers to the text matched by the first set of capturing parentheses. 602 * 603 * @param regexp The compiled regular expression. 604 * @param groupNum The capture group number 605 * @param status A reference to a UErrorCode to receive any errors. 606 * @return the index of the position following the last matched character. 607 * @stable ICU 3.0 608 */ 609 U_STABLE int32_t U_EXPORT2 610 uregex_end(URegularExpression *regexp, 611 int32_t groupNum, 612 UErrorCode *status); 613 614 /** 615 * Reset any saved state from the previous match. Has the effect of 616 * causing uregex_findNext to begin at the specified index, and causing 617 * uregex_start(), uregex_end() and uregex_group() to return an error 618 * indicating that there is no match information available. Clears any 619 * match region that may have been set. 620 * 621 * @param regexp The compiled regular expression. 622 * @param index The position in the text at which a 623 * uregex_findNext() should begin searching. 624 * @param status A reference to a UErrorCode to receive any errors. 625 * @stable ICU 3.0 626 */ 627 U_STABLE void U_EXPORT2 628 uregex_reset(URegularExpression *regexp, 629 int32_t index, 630 UErrorCode *status); 631 632 633 /** Sets the limits of the matching region for this URegularExpression. 634 * The region is the part of the input string that will be considered when matching. 
635 * Invoking this method resets any saved state from the previous match, 636 * then sets the region to start at the index specified by the start parameter 637 * and end at the index specified by the end parameter. 638 * 639 * Depending on the transparency and anchoring being used (see useTransparentBounds 640 * and useAnchoringBounds), certain constructs such as anchors may behave differently 641 * at or around the boundaries of the region 642 * 643 * The function will fail if start is greater than limit, or if either index 644 * is less than zero or greater than the length of the string being matched. 645 * 646 * @param regexp The compiled regular expression. 647 * @param regionStart The index to begin searches at. 648 * @param regionLimit The index to end searches at (exclusive). 649 * @param status A pointer to a UErrorCode to receive any errors. 650 * @stable ICU 4.0 651 */ 652 U_STABLE void U_EXPORT2 653 uregex_setRegion(URegularExpression *regexp, 654 int32_t regionStart, 655 int32_t regionLimit, 656 UErrorCode *status); 657 658 /** 659 * Reports the start index of the matching region. Any matches found are limited to 660 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive). 661 * 662 * @param regexp The compiled regular expression. 663 * @param status A pointer to a UErrorCode to receive any errors. 664 * @return The starting index of this matcher's region. 665 * @stable ICU 4.0 666 */ 667 U_STABLE int32_t U_EXPORT2 668 uregex_regionStart(const URegularExpression *regexp, 669 UErrorCode *status); 670 671 672 673 /** 674 * Reports the end index (exclusive) of the matching region for this URegularExpression. 675 * Any matches found are limited to to the region bounded by regionStart (inclusive) 676 * and regionEnd (exclusive). 677 * 678 * @param regexp The compiled regular expression. 679 * @param status A pointer to a UErrorCode to receive any errors. 680 * @return The ending point of this matcher's region. 
681 * @stable ICU 4.0 682 */ 683 U_STABLE int32_t U_EXPORT2 684 uregex_regionEnd(const URegularExpression *regexp, 685 UErrorCode *status); 686 687 /** 688 * Queries the transparency of region bounds for this URegularExpression. 689 * See useTransparentBounds for a description of transparent and opaque bounds. 690 * By default, matching boundaries are opaque. 691 * 692 * @param regexp The compiled regular expression. 693 * @param status A pointer to a UErrorCode to receive any errors. 694 * @return TRUE if this matcher is using opaque bounds, false if it is not. 695 * @stable ICU 4.0 696 */ 697 U_STABLE UBool U_EXPORT2 698 uregex_hasTransparentBounds(const URegularExpression *regexp, 699 UErrorCode *status); 700 701 702 /** 703 * Sets the transparency of region bounds for this URegularExpression. 704 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 705 * If the boolean argument is FALSE, then opaque bounds will be used. 706 * 707 * Using transparent bounds, the boundaries of the matching region are transparent 708 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 709 * see text beyond the boundaries of the region while checking for a match. 710 * 711 * With opaque bounds, no text outside of the matching region is visible to lookahead, 712 * lookbehind, and boundary matching constructs. 713 * 714 * By default, opaque bounds are used. 715 * 716 * @param regexp The compiled regular expression. 717 * @param b TRUE for transparent bounds; FALSE for opaque bounds 718 * @param status A pointer to a UErrorCode to receive any errors. 719 * @stable ICU 4.0 720 **/ 721 U_STABLE void U_EXPORT2 722 uregex_useTransparentBounds(URegularExpression *regexp, 723 UBool b, 724 UErrorCode *status); 725 726 727 /** 728 * Return true if this URegularExpression is using anchoring bounds. 729 * By default, anchoring region bounds are used. 730 * 731 * @param regexp The compiled regular expression. 
732 * @param status A pointer to a UErrorCode to receive any errors. 733 * @return TRUE if this matcher is using anchoring bounds. 734 * @stable ICU 4.0 735 */ 736 U_STABLE UBool U_EXPORT2 737 uregex_hasAnchoringBounds(const URegularExpression *regexp, 738 UErrorCode *status); 739 740 741 /** 742 * Set whether this URegularExpression is using Anchoring Bounds for its region. 743 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 744 * and end of the region. Without Anchoring Bounds, anchors will only match at 745 * the positions they would in the complete text. 746 * 747 * Anchoring Bounds are the default for regions. 748 * 749 * @param regexp The compiled regular expression. 750 * @param b TRUE if to enable anchoring bounds; FALSE to disable them. 751 * @param status A pointer to a UErrorCode to receive any errors. 752 * @stable ICU 4.0 753 */ 754 U_STABLE void U_EXPORT2 755 uregex_useAnchoringBounds(URegularExpression *regexp, 756 UBool b, 757 UErrorCode *status); 758 759 /** 760 * Return TRUE if the most recent matching operation touched the 761 * end of the text being processed. In this case, additional input text could 762 * change the results of that match. 763 * 764 * @param regexp The compiled regular expression. 765 * @param status A pointer to a UErrorCode to receive any errors. 766 * @return TRUE if the most recent match hit the end of input 767 * @stable ICU 4.0 768 */ 769 U_STABLE UBool U_EXPORT2 770 uregex_hitEnd(const URegularExpression *regexp, 771 UErrorCode *status); 772 773 /** 774 * Return TRUE the most recent match succeeded and additional input could cause 775 * it to fail. If this function returns false and a match was found, then more input 776 * might change the match but the match won't be lost. If a match was not found, 777 * then requireEnd has no meaning. 778 * 779 * @param regexp The compiled regular expression. 780 * @param status A pointer to a UErrorCode to receive any errors. 
781 * @return TRUE if more input could cause the most recent match to no longer match. 782 * @stable ICU 4.0 783 */ 784 U_STABLE UBool U_EXPORT2 785 uregex_requireEnd(const URegularExpression *regexp, 786 UErrorCode *status); 787 788 789 790 791 792 /** 793 * Replaces every substring of the input that matches the pattern 794 * with the given replacement string. This is a convenience function that 795 * provides a complete find-and-replace-all operation. 796 * 797 * This method scans the input string looking for matches of the pattern. 798 * Input that is not part of any match is copied unchanged to the 799 * destination buffer. Matched regions are replaced in the output 800 * buffer by the replacement string. The replacement string may contain 801 * references to capture groups; these take the form of $1, $2, etc. 802 * 803 * @param regexp The compiled regular expression. 804 * @param replacementText A string containing the replacement text. 805 * @param replacementLength The length of the replacement string, or 806 * -1 if it is NUL terminated. 807 * @param destBuf A (UChar *) buffer that will receive the result. 808 * @param destCapacity The capacity of the desitnation buffer. 809 * @param status A reference to a UErrorCode to receive any errors. 810 * @return The length of the string resulting from the find 811 * and replace operation. In the event that the 812 * destination capacity is inadequate, the return value 813 * is still the full length of the untruncated string. 814 * @stable ICU 3.0 815 */ 816 U_STABLE int32_t U_EXPORT2 817 uregex_replaceAll(URegularExpression *regexp, 818 const UChar *replacementText, 819 int32_t replacementLength, 820 UChar *destBuf, 821 int32_t destCapacity, 822 UErrorCode *status); 823 824 /** 825 * Replaces every substring of the input that matches the pattern 826 * with the given replacement string. This is a convenience function that 827 * provides a complete find-and-replace-all operation. 
828 * 829 * This method scans the input string looking for matches of the pattern. 830 * Input that is not part of any match is copied unchanged to the 831 * destination buffer. Matched regions are replaced in the output 832 * buffer by the replacement string. The replacement string may contain 833 * references to capture groups; these take the form of $1, $2, etc. 834 * 835 * @param regexp The compiled regular expression. 836 * @param replacement A string containing the replacement text. 837 * @param dest A mutable UText that will receive the result. 838 * If NULL, a new UText will be created (which may not be mutable). 839 * @param status A reference to a UErrorCode to receive any errors. 840 * @return A UText containing the results of the find and replace. 841 * If a pre-allocated UText was provided, it will always be used and returned. 842 * 843 * @internal ICU 4.4 technology preview 844 */ 845 U_INTERNAL UText * U_EXPORT2 846 uregex_replaceAllUText(URegularExpression *regexp, 847 UText *replacement, 848 UText *dest, 849 UErrorCode *status); 850 851 /** 852 * Replaces the first substring of the input that matches the pattern 853 * with the given replacement string. This is a convenience function that 854 * provides a complete find-and-replace operation. 855 * 856 * This method scans the input string looking for a match of the pattern. 857 * All input that is not part of the match is copied unchanged to the 858 * destination buffer. The matched region is replaced in the output 859 * buffer by the replacement string. The replacement string may contain 860 * references to capture groups; these take the form of $1, $2, etc. 861 * 862 * @param regexp The compiled regular expression. 863 * @param replacementText A string containing the replacement text. 864 * @param replacementLength The length of the replacement string, or 865 * -1 if it is NUL terminated. 866 * @param destBuf A (UChar *) buffer that will receive the result.
867 * @param destCapacity The capacity of the destination buffer. 868 * @param status A reference to a UErrorCode to receive any errors. 869 * @return The length of the string resulting from the find 870 * and replace operation. In the event that the 871 * destination capacity is inadequate, the return value 872 * is still the full length of the untruncated string. 873 * @stable ICU 3.0 874 */ 875 U_STABLE int32_t U_EXPORT2 876 uregex_replaceFirst(URegularExpression *regexp, 877 const UChar *replacementText, 878 int32_t replacementLength, 879 UChar *destBuf, 880 int32_t destCapacity, 881 UErrorCode *status); 882 883 /** 884 * Replaces the first substring of the input that matches the pattern 885 * with the given replacement string. This is a convenience function that 886 * provides a complete find-and-replace operation. 887 * 888 * This method scans the input string looking for a match of the pattern. 889 * All input that is not part of the match is copied unchanged to the 890 * destination buffer. The matched region is replaced in the output 891 * buffer by the replacement string. The replacement string may contain 892 * references to capture groups; these take the form of $1, $2, etc. 893 * 894 * @param regexp The compiled regular expression. 895 * @param replacement A string containing the replacement text. 896 * @param dest A mutable UText that will receive the result. 897 * If NULL, a new UText will be created (which may not be mutable). 898 * @param status A reference to a UErrorCode to receive any errors. 899 * @return A UText containing the results of the find and replace. 900 * If a pre-allocated UText was provided, it will always be used and returned.
901 * 902 * @internal ICU 4.4 technology preview 903 */ 904 U_INTERNAL UText * U_EXPORT2 905 uregex_replaceFirstUText(URegularExpression *regexp, 906 UText *replacement, 907 UText *dest, 908 UErrorCode *status); 909 910 911 /** 912 * Implements a replace operation intended to be used as part of an 913 * incremental find-and-replace. 914 * 915 * <p>The input string, starting from the end of the previous match and ending at 916 * the start of the current match, is appended to the destination string. Then the 917 * replacement string is appended to the output string, 918 * including handling any substitutions of captured text.</p> 919 * 920 * <p>A note on preflight computation of buffer size and error handling: 921 * Calls to uregex_appendReplacement() and uregex_appendTail() are 922 * designed to be chained, one after another, with the destination 923 * buffer pointer and buffer capacity updated after each in preparation 924 * for the next. If the destination buffer is exhausted partway through such a 925 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 926 * ICU conventions are for a function to perform no action if it is 927 * called with an error status, but for this one case, uregex_appendReplacement() 928 * will operate normally so that buffer size computations will complete 929 * correctly.</p> 930 * 931 * <p>For simple, prepackaged, non-incremental find-and-replace 932 * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> 933 * 934 * @param regexp The regular expression object. 935 * @param replacementText The string that will replace the matched portion of the 936 * input string as it is copied to the destination buffer. 937 * The replacement text may contain references ($1, for 938 * example) to capture groups from the match. 939 * @param replacementLength The length of the replacement text string, 940 * or -1 if the string is NUL terminated. 941 * @param destBuf The buffer into which the results of the 942 * find-and-replace are placed.
On return, this pointer 943 * will be updated to refer to the beginning of the 944 * unused portion of buffer, leaving it in position for 945 * a subsequent call to this function. 946 * @param destCapacity The size of the output buffer. On return, this 947 * parameter will be updated to reflect the space remaining 948 * unused in the output buffer. 949 * @param status A reference to a UErrorCode to receive any errors. 950 * @return The length of the result string. In the event that 951 * destCapacity is inadequate, the full length of the 952 * untruncated output string is returned. 953 * 954 * @stable ICU 3.0 955 * 956 */ 957 U_STABLE int32_t U_EXPORT2 958 uregex_appendReplacement(URegularExpression *regexp, 959 const UChar *replacementText, 960 int32_t replacementLength, 961 UChar **destBuf, 962 int32_t *destCapacity, 963 UErrorCode *status); 964 965 966 /** 967 * Implements a replace operation intended to be used as part of an 968 * incremental find-and-replace. 969 * 970 * <p>The input string, starting from the end of the previous match and ending at 971 * the start of the current match, is appended to the destination string. Then the 972 * replacement string is appended to the output string, 973 * including handling any substitutions of captured text.</p> 974 * 975 * <p>For simple, prepackaged, non-incremental find-and-replace 976 * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p> 977 * 978 * @param regexp The regular expression object. 979 * @param replacementText The string that will replace the matched portion of the 980 * input string as it is copied to the destination buffer. 981 * The replacement text may contain references ($1, for 982 * example) to capture groups from the match. 983 * @param dest A mutable UText that will receive the result. Must not be NULL. 984 * @param status A reference to a UErrorCode to receive any errors.
985 * 986 * @internal ICU 4.4 technology preview 987 */ 988 U_INTERNAL void U_EXPORT2 989 uregex_appendReplacementUText(URegularExpression *regexp, 990 UText *replacementText, 991 UText *dest, 992 UErrorCode *status); 993 994 995 /** 996 * As the final step in a find-and-replace operation, append the remainder 997 * of the input string, starting at the position following the last match, 998 * to the destination string. <code>uregex_appendTail()</code> is intended 999 * to be invoked after one or more invocations of the 1000 * <code>uregex_appendReplacement()</code> function. 1001 * 1002 * @param regexp The regular expression object. This is needed to 1003 * obtain the input string and the position 1004 * of the last match within it. 1005 * @param destBuf The buffer in which the results of the 1006 * find-and-replace are placed. On return, the pointer 1007 * will be updated to refer to the beginning of the 1008 * unused portion of buffer. 1009 * @param destCapacity The size of the output buffer. On return, this 1010 * value will be updated to reflect the space remaining 1011 * unused in the output buffer. 1012 * @param status A reference to a UErrorCode to receive any errors. 1013 * @return The length of the result string. In the event that 1014 * destCapacity is inadequate, the full length of the 1015 * untruncated output string is returned. 1016 * 1017 * @stable ICU 3.0 1018 */ 1019 U_STABLE int32_t U_EXPORT2 1020 uregex_appendTail(URegularExpression *regexp, 1021 UChar **destBuf, 1022 int32_t *destCapacity, 1023 UErrorCode *status); 1024 1025 1026 /** 1027 * As the final step in a find-and-replace operation, append the remainder 1028 * of the input string, starting at the position following the last match, 1029 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1030 * to be invoked after one or more invocations of the 1031 * <code>uregex_appendReplacementUText()</code> function.
1032 * 1033 * @param regexp The regular expression object. This is needed to 1034 * obtain the input string and the position 1035 * of the last match within it. 1036 * @param dest A mutable UText that will receive the result. Must not be NULL. 1037 * @return The destination UText. 1038 * 1039 * @internal ICU 4.4 technology preview 1040 */ 1041 U_INTERNAL UText * U_EXPORT2 1042 uregex_appendTailUText(URegularExpression *regexp, 1043 UText *dest); 1044 1045 1046 1047 /** 1048 * Split a string into fields. Somewhat like split() from Perl. 1049 * The pattern matches identify delimiters that separate the input 1050 * into fields. The input data between the matches becomes the 1051 * fields themselves. 1052 * <p> 1053 * Each of the fields is copied from the input string to the destination 1054 * buffer, and NUL terminated. The position of each field within 1055 * the destination buffer is returned in the destFields array. 1056 * 1057 * Note: another choice for the design of this function would be to not 1058 * copy the resulting fields at all, but to return indexes and 1059 * lengths within the source text. 1060 * Advantages would be 1061 * o Faster. No Copying. 1062 * o Nothing extra needed when field data may contain embedded NUL chars. 1063 * o Less memory needed if working on large data. 1064 * Disadvantages 1065 * o Less consistent with C++ split, which copies into an 1066 * array of UnicodeStrings. 1067 * o No NUL termination, extracted fields would be less convenient 1068 * to use in most cases. 1069 * o Possible problems in the future, when support for Unicode Normalization 1070 * could cause the fields to not correspond exactly to 1071 * a range of the source text. 1072 * 1073 * @param regexp The compiled regular expression. 1074 * @param destBuf A (UChar *) buffer to receive the fields that 1075 * are extracted from the input string. These 1076 * field pointers will refer to positions within the 1077 * destination buffer supplied by the caller.
Any 1078 * extra positions within the destFields array will be 1079 * set to NULL. 1080 * @param destCapacity The capacity of the destBuf. 1081 * @param requiredCapacity The actual capacity required of the destBuf. 1082 * If destCapacity is too small, requiredCapacity will return 1083 * the total capacity required to hold all of the output, and 1084 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1085 * @param destFields An array to be filled with the position of each 1086 * of the extracted fields within destBuf. 1087 * @param destFieldsCapacity The number of elements in the destFields array. 1088 * If the number of fields found is less than destFieldsCapacity, 1089 * the extra destFields elements are set to zero. 1090 * If destFieldsCapacity is too small, the trailing part of the 1091 * input, including any field delimiters, is treated as if it 1092 * were the last field - it is copied to the destBuf, and 1093 * its position in the destBuf is stored in the last element 1094 * of destFields. This behavior mimics that of Perl. It is not 1095 * an error condition, and no error status is returned when all destFields 1096 * positions are used. 1097 * @param status A reference to a UErrorCode to receive any errors. 1098 * @return The number of fields into which the input string was split. 1099 * @stable ICU 3.0 1100 */ 1101 U_STABLE int32_t U_EXPORT2 1102 uregex_split( URegularExpression *regexp, 1103 UChar *destBuf, 1104 int32_t destCapacity, 1105 int32_t *requiredCapacity, 1106 UChar *destFields[], 1107 int32_t destFieldsCapacity, 1108 UErrorCode *status); 1109 1110 1111 /** 1112 * Split a string into fields. Somewhat like split() from Perl. 1113 * The pattern matches identify delimiters that separate the input 1114 * into fields. The input data between the matches becomes the 1115 * fields themselves.
1116 * <p> 1117 * The behavior of this function is not very closely aligned with uregex_split(); 1118 * instead, it is based on (and implemented directly on top of) the C++ split method. 1119 * 1120 * @param regexp The compiled regular expression. 1121 * @param destFields An array of mutable UText structs to receive the results of the split. 1122 * If a field is NULL, a new UText is allocated to contain the results for 1123 * that field. This new UText is not guaranteed to be mutable. 1124 * @param destFieldsCapacity The number of elements in the destination array. 1125 * If the number of fields found is less than destFieldsCapacity, the 1126 * extra strings in the destination array are not altered. 1127 * If the number of destination strings is less than the number 1128 * of fields, the trailing part of the input string, including any 1129 * field delimiters, is placed in the last destination string. 1130 * This behavior mimics that of Perl. It is not an error condition, and no 1131 * error status is returned when all destFields positions are used. 1132 * @param status A reference to a UErrorCode to receive any errors. 1133 * @return The number of fields into which the input string was split. 1134 * 1135 * @internal ICU 4.4 technology preview 1136 */ 1137 U_INTERNAL int32_t U_EXPORT2 1138 uregex_splitUText(URegularExpression *regexp, 1139 UText *destFields[], 1140 int32_t destFieldsCapacity, 1141 UErrorCode *status); 1142 1143 1144 1145 1146 /** 1147 * Set a processing time limit for match operations with this URegularExpression. 1148 * 1149 * Some patterns, when matching certain strings, can run in exponential time. 1150 * For practical purposes, the match operation may appear to be in an 1151 * infinite loop. 1152 * When a limit is set, a match operation will fail with an error if the 1153 * limit is exceeded. 1154 * <p> 1155 * The units of the limit are steps of the match engine.
1156 * Correspondence with actual processor time will depend on the speed 1157 * of the processor and the details of the specific pattern, but will 1158 * typically be on the order of milliseconds. 1159 * <p> 1160 * By default, the matching time is not limited. 1161 * <p> 1162 * 1163 * @param regexp The compiled regular expression. 1164 * @param limit The limit value, or 0 for no limit. 1165 * @param status A reference to a UErrorCode to receive any errors. 1166 * @stable ICU 4.0 1167 */ 1168 U_STABLE void U_EXPORT2 1169 uregex_setTimeLimit(URegularExpression *regexp, 1170 int32_t limit, 1171 UErrorCode *status); 1172 1173 /** 1174 * Get the time limit for matches with this URegularExpression. 1175 * A return value of zero indicates that there is no limit. 1176 * 1177 * @param regexp The compiled regular expression. 1178 * @param status A reference to a UErrorCode to receive any errors. 1179 * @return the maximum allowed time for a match, in units of processing steps. 1180 * @stable ICU 4.0 1181 */ 1182 U_STABLE int32_t U_EXPORT2 1183 uregex_getTimeLimit(const URegularExpression *regexp, 1184 UErrorCode *status); 1185 1186 /** 1187 * Set the amount of heap storage available for use by the match backtracking stack. 1188 * <p> 1189 * ICU uses a backtracking regular expression engine, with the backtrack stack 1190 * maintained on the heap. This function sets the limit on the amount of memory 1191 * that can be used for this purpose. A backtracking stack overflow will 1192 * result in an error from the match operation that caused it. 1193 * <p> 1194 * A limit is desirable because a malicious or poorly designed pattern can use 1195 * excessive memory, potentially crashing the process. A limit is enabled 1196 * by default. 1197 * <p> 1198 * @param regexp The compiled regular expression. 1199 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1200 * A value of -1 means no limit. 1201 * The limit must be greater than zero, or -1.
1202 * @param status A reference to a UErrorCode to receive any errors. 1203 * 1204 * @stable ICU 4.0 1205 */ 1206 U_STABLE void U_EXPORT2 1207 uregex_setStackLimit(URegularExpression *regexp, 1208 int32_t limit, 1209 UErrorCode *status); 1210 1211 /** 1212 * Get the size of the heap storage available for use by the backtracking stack. 1213 * 1214 * @param regexp The compiled regular expression. @param status A reference to a UErrorCode to receive any errors. @return the maximum backtracking stack size, in bytes, or zero if the 1215 * stack size is unlimited. NOTE(review): uregex_setStackLimit() documents -1 as the value meaning no limit; confirm that zero is what is reported here. 1216 * @stable ICU 4.0 1217 */ 1218 U_STABLE int32_t U_EXPORT2 1219 uregex_getStackLimit(const URegularExpression *regexp, 1220 UErrorCode *status); 1221 1222 1223 /** 1224 * Function type for a regular expression matching callback function. 1225 * When set, a callback function will be called periodically during matching 1226 * operations. If the callback function returns FALSE, the matching 1227 * operation will be terminated early. 1228 * 1229 * Note: the callback function must not call other functions on this 1230 * URegularExpression. 1231 * 1232 * @param context context pointer. The callback function will be invoked 1233 * with the context specified at the time that 1234 * uregex_setMatchCallback() is called. 1235 * @param steps the accumulated processing time, in match steps, 1236 * for this matching operation. 1237 * @return TRUE to continue the matching operation. 1238 * FALSE to terminate the matching operation. 1239 * @stable ICU 4.0 1240 */ 1241 U_CDECL_BEGIN 1242 typedef UBool U_CALLCONV URegexMatchCallback ( 1243 const void *context, 1244 int32_t steps); 1245 U_CDECL_END 1246 1247 /** 1248 * Set a callback function for this URegularExpression. 1249 * During matching operations the function will be called periodically, 1250 * giving the application the opportunity to terminate a long-running 1251 * match. 1252 * 1253 * @param regexp The compiled regular expression. 1254 * @param callback A pointer to the user-supplied callback function. 1255 * @param context User context pointer.
The value supplied at the 1256 * time the callback function is set will be saved 1257 * and passed to the callback each time that it is called. 1258 * @param status A reference to a UErrorCode to receive any errors. 1259 * @stable ICU 4.0 1260 */ 1261 U_STABLE void U_EXPORT2 1262 uregex_setMatchCallback(URegularExpression *regexp, 1263 URegexMatchCallback *callback, 1264 const void *context, 1265 UErrorCode *status); 1266 1267 1268 /** 1269 * Get the callback function for this URegularExpression. 1270 * 1271 * @param regexp The compiled regular expression. 1272 * @param callback Out parameter, receives a pointer to the user-supplied 1273 * callback function. 1274 * @param context Out parameter, receives the user context pointer that 1275 * was set when uregex_setMatchCallback() was called. 1276 * @param status A reference to a UErrorCode to receive any errors. 1277 * @stable ICU 4.0 1278 */ 1279 U_STABLE void U_EXPORT2 1280 uregex_getMatchCallback(const URegularExpression *regexp, 1281 URegexMatchCallback **callback, 1282 const void **context, 1283 UErrorCode *status); 1284 1285 1286 1287 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1288 #endif /* UREGEX_H */ 1289