1 /* 2 ********************************************************************** 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: uregex.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2004mar09 11 * created by: Andy Heninger 12 * 13 * ICU Regular Expressions, API for C 14 */ 15 16 /** 17 * \file 18 * \brief C API: Regular Expressions 19 * 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 21 */ 22 23 #ifndef UREGEX_H 24 #define UREGEX_H 25 26 #include "unicode/utext.h" 27 #include "unicode/utypes.h" 28 29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 30 31 #include "unicode/localpointer.h" 32 #include "unicode/parseerr.h" 33 34 struct URegularExpression; 35 /** 36 * Structure representing a compiled regular expression, plus the results 37 * of a match operation. 38 * @stable ICU 3.0 39 */ 40 typedef struct URegularExpression URegularExpression; 41 42 43 /** 44 * Constants for Regular Expression Match Modes. 45 * @stable ICU 2.4 46 */ 47 typedef enum URegexpFlag{ 48 49 #ifndef U_HIDE_DRAFT_API 50 /** Forces normalization of pattern and strings. 51 Not implemented yet, just a placeholder, hence draft. 52 @draft ICU 2.4 */ 53 UREGEX_CANON_EQ = 128, 54 #endif 55 /** Enable case insensitive matching. @stable ICU 2.4 */ 56 UREGEX_CASE_INSENSITIVE = 2, 57 58 /** Allow white space and comments within patterns @stable ICU 2.4 */ 59 UREGEX_COMMENTS = 4, 60 61 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 62 * @stable ICU 2.4 */ 63 UREGEX_DOTALL = 32, 64 65 /** If set, treat the entire pattern as a literal string. 66 * Metacharacters or escape sequences in the input sequence will be given 67 * no special meaning. Not implemented yet as of ICU 4.4. 
68 * 69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact 70 * on matching when used in conjunction with this flag. 71 * The other flags become superfluous. 72 * TODO: say which escapes are still handled; anything Java does 73 * early (\\u) we should still do. 74 * @stable ICU 4.0 75 */ 76 UREGEX_LITERAL = 16, 77 78 /** Control behavior of "$" and "^" 79 * If set, recognize line terminators within string, 80 * otherwise, match only at start and end of input string. 81 * @stable ICU 2.4 */ 82 UREGEX_MULTILINE = 8, 83 84 /** Unix-only line endings. 85 * When this mode is enabled, only \\u000a is recognized as a line ending 86 * in the behavior of ., ^, and $. 87 * @stable ICU 4.0 88 */ 89 UREGEX_UNIX_LINES = 1, 90 91 /** Unicode word boundaries. 92 * If set, \b uses the Unicode TR 29 definition of word boundaries. 93 * Warning: Unicode word boundaries are quite different from 94 * traditional regular expression word boundaries. See 95 * http://unicode.org/reports/tr29/#Word_Boundaries 96 * @stable ICU 2.8 97 */ 98 UREGEX_UWORD = 256, 99 100 /** Error on Unrecognized backslash escapes. 101 * If set, fail with an error on patterns that contain 102 * backslash-escaped ASCII letters without a known special 103 * meaning. If this flag is not set, these 104 * escaped letters represent themselves. 105 * @stable ICU 4.0 106 */ 107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 108 109 } URegexpFlag; 110 111 /** 112 * Open (compile) an ICU regular expression. Compiles the regular expression in 113 * string form into an internal representation using the specified match mode flags. 114 * The resulting regular expression handle can then be used to perform various 115 * matching operations. 116 * 117 * 118 * @param pattern The Regular Expression pattern to be compiled. 119 * @param patternLength The length of the pattern, or -1 if the pattern is 120 * NUL terminated. 
121 * @param flags Flags that alter the default matching behavior for 122 * the regular expression, UREGEX_CASE_INSENSITIVE, for 123 * example. For default behavior, set this parameter to zero. 124 * See <code>enum URegexpFlag</code>. All desired flags 125 * are bitwise-ORed together. 126 * @param pe Receives the position (line and column numbers) of any syntax 127 * error within the source regular expression string. If this 128 * information is not wanted, pass NULL for this parameter. 129 * @param status Receives errors detected by this function. 130 * @stable ICU 3.0 131 * 132 */ 133 U_STABLE URegularExpression * U_EXPORT2 134 uregex_open( const UChar *pattern, 135 int32_t patternLength, 136 uint32_t flags, 137 UParseError *pe, 138 UErrorCode *status); 139 140 /** 141 * Open (compile) an ICU regular expression. Compiles the regular expression in 142 * string form into an internal representation using the specified match mode flags. 143 * The resulting regular expression handle can then be used to perform various 144 * matching operations. 145 * <p> 146 * The contents of the pattern UText will be extracted and saved. Ownership of the 147 * UText struct itself remains with the caller. This is to match the behavior of 148 * uregex_open(). 149 * 150 * @param pattern The Regular Expression pattern to be compiled. 151 * @param flags Flags that alter the default matching behavior for 152 * the regular expression, UREGEX_CASE_INSENSITIVE, for 153 * example. For default behavior, set this parameter to zero. 154 * See <code>enum URegexpFlag</code>. All desired flags 155 * are bitwise-ORed together. 156 * @param pe Receives the position (line and column numbers) of any syntax 157 * error within the source regular expression string. If this 158 * information is not wanted, pass NULL for this parameter. 159 * @param status Receives errors detected by this function. 
160 * 161 * @draft ICU 4.6 162 */ 163 U_DRAFT URegularExpression * U_EXPORT2 164 uregex_openUText(UText *pattern, 165 uint32_t flags, 166 UParseError *pe, 167 UErrorCode *status); 168 169 /** 170 * Open (compile) an ICU regular expression. The resulting regular expression 171 * handle can then be used to perform various matching operations. 172 * <p> 173 * This function is the same as uregex_open, except that the pattern 174 * is supplied as an 8 bit char * string in the default code page. 175 * 176 * @param pattern The Regular Expression pattern to be compiled, 177 * NUL terminated. 178 * @param flags Flags that alter the default matching behavior for 179 * the regular expression, UREGEX_CASE_INSENSITIVE, for 180 * example. For default behavior, set this parameter to zero. 181 * See <code>enum URegexpFlag</code>. All desired flags 182 * are bitwise-ORed together. 183 * @param pe Receives the position (line and column numbers) of any syntax 184 * error within the source regular expression string. If this 185 * information is not wanted, pass NULL for this parameter. 186 * @param status Receives errors detected by this function. 187 * @return The URegularExpression object representing the compiled 188 * pattern. 189 * 190 * @stable ICU 3.0 191 */ 192 #if !UCONFIG_NO_CONVERSION 193 U_STABLE URegularExpression * U_EXPORT2 194 uregex_openC( const char *pattern, 195 uint32_t flags, 196 UParseError *pe, 197 UErrorCode *status); 198 #endif 199 200 201 202 /** 203 * Close the regular expression, recovering all resources (memory) it 204 * was holding. 205 * 206 * @param regexp The regular expression to be closed. 207 * @stable ICU 3.0 208 */ 209 U_STABLE void U_EXPORT2 210 uregex_close(URegularExpression *regexp); 211 212 #if U_SHOW_CPLUSPLUS_API 213 214 U_NAMESPACE_BEGIN 215 216 /** 217 * \class LocalURegularExpressionPointer 218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 219 * For most methods see the LocalPointerBase base class. 
220 * 221 * @see LocalPointerBase 222 * @see LocalPointer 223 * @stable ICU 4.4 224 */ 225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 226 227 U_NAMESPACE_END 228 229 #endif 230 231 /** 232 * Make a copy of a compiled regular expression. Cloning a regular 233 * expression is faster than opening a second instance from the source 234 * form of the expression, and requires less memory. 235 * <p> 236 * Note that the current input string and the position of any matched text 237 * within it are not cloned; only the pattern itself and the 238 * match mode flags are copied. 239 * <p> 240 * Cloning can be particularly useful to threaded applications that perform 241 * multiple match operations in parallel. Each concurrent RE 242 * operation requires its own instance of a URegularExpression. 243 * 244 * @param regexp The compiled regular expression to be cloned. 245 * @param status Receives indication of any errors encountered 246 * @return the cloned copy of the compiled regular expression. 247 * @stable ICU 3.0 248 */ 249 U_STABLE URegularExpression * U_EXPORT2 250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 251 252 /** 253 * Returns a pointer to the source form of the pattern for this regular expression. 254 * This function will work even if the pattern was originally specified as a UText. 255 * 256 * @param regexp The compiled regular expression. 257 * @param patLength This output parameter will be set to the length of the 258 * pattern string. A NULL pointer may be used here if the 259 * pattern length is not needed, as would be the case if 260 * the pattern is known in advance to be a NUL terminated 261 * string. 262 * @param status Receives errors detected by this function. 263 * @return a pointer to the pattern string. The storage for the string is 264 * owned by the regular expression object, and must not be 265 * altered or deleted by the application. 
The returned string 266 * will remain valid until the regular expression is closed. 267 * @stable ICU 3.0 268 */ 269 U_STABLE const UChar * U_EXPORT2 270 uregex_pattern(const URegularExpression *regexp, 271 int32_t *patLength, 272 UErrorCode *status); 273 274 /** 275 * Returns the source text of the pattern for this regular expression. 276 * This function will work even if the pattern was originally specified as a UChar string. 277 * 278 * @param regexp The compiled regular expression. 279 * @param status Receives errors detected by this function. 280 * @return the pattern text. The storage for the text is owned by the regular expression 281 * object, and must not be altered or deleted. 282 * 283 * @draft ICU 4.6 284 */ 285 U_DRAFT UText * U_EXPORT2 286 uregex_patternUText(const URegularExpression *regexp, 287 UErrorCode *status); 288 289 290 /** 291 * Get the match mode flags that were specified when compiling this regular expression. 292 * @param status Receives errors detected by this function. 293 * @param regexp The compiled regular expression. 294 * @return The match mode flags 295 * @see URegexpFlag 296 * @stable ICU 3.0 297 */ 298 U_STABLE int32_t U_EXPORT2 299 uregex_flags(const URegularExpression *regexp, 300 UErrorCode *status); 301 302 303 /** 304 * Set the subject text string upon which the regular expression will look for matches. 305 * This function may be called any number of times, allowing the regular 306 * expression pattern to be applied to different strings. 307 * <p> 308 * Regular expression matching operations work directly on the application's 309 * string data. No copy is made. The subject string data must not be 310 * altered after calling this function until after all regular expression 311 * operations involving this string data are completed. 312 * <p> 313 * Zero length strings are permitted. In this case, no subsequent match 314 * operation will dereference the text string pointer. 
315 * 316 * @param regexp The compiled regular expression. 317 * @param text The subject text string. 318 * @param textLength The length of the subject text, or -1 if the string 319 * is NUL terminated. 320 * @param status Receives errors detected by this function. 321 * @stable ICU 3.0 322 */ 323 U_STABLE void U_EXPORT2 324 uregex_setText(URegularExpression *regexp, 325 const UChar *text, 326 int32_t textLength, 327 UErrorCode *status); 328 329 330 /** 331 * Set the subject text string upon which the regular expression will look for matches. 332 * This function may be called any number of times, allowing the regular 333 * expression pattern to be applied to different strings. 334 * <p> 335 * Regular expression matching operations work directly on the application's 336 * string data; only a shallow clone is made. The subject string data must not be 337 * altered after calling this function until after all regular expression 338 * operations involving this string data are completed. 339 * 340 * @param regexp The compiled regular expression. 341 * @param text The subject text string. 342 * @param status Receives errors detected by this function. 343 * 344 * @draft ICU 4.6 345 */ 346 U_DRAFT void U_EXPORT2 347 uregex_setUText(URegularExpression *regexp, 348 UText *text, 349 UErrorCode *status); 350 351 /** 352 * Get the subject text that is currently associated with this 353 * regular expression object. If the input was supplied using uregex_setText(), 354 * that pointer will be returned. Otherwise, the characters in the input will 355 * be extracted to a buffer and returned. In either case, ownership remains 356 * with the regular expression object. 357 * 358 * This function will work even if the input was originally specified as a UText. 359 * 360 * @param regexp The compiled regular expression. 361 * @param textLength The length of the string is returned in this output parameter. 
362 * A NULL pointer may be used here if the 363 * text length is not needed, as would be the case if 364 * the text is known in advance to be a NUL terminated 365 * string. 366 * @param status Receives errors detected by this function. 367 * @return Pointer to the subject text string currently associated with 368 * this regular expression. 369 * @stable ICU 3.0 370 */ 371 U_STABLE const UChar * U_EXPORT2 372 uregex_getText(URegularExpression *regexp, 373 int32_t *textLength, 374 UErrorCode *status); 375 376 377 /** 378 * Get the subject text that is currently associated with this 379 * regular expression object. 380 * 381 * This function will work even if the input was originally specified as a UChar string. 382 * 383 * @param regexp The compiled regular expression. 384 * @param dest A mutable UText in which to store the current input. 385 * If NULL, a new UText will be created as an immutable shallow clone 386 * of the actual input string. 387 * @param status Receives errors detected by this function. 388 * @return The subject text currently associated with this regular expression. 389 * If a pre-allocated UText was provided, it will always be used and returned. 390 * 391 * @draft ICU 4.6 392 */ 393 U_DRAFT UText * U_EXPORT2 394 uregex_getUText(URegularExpression *regexp, 395 UText *dest, 396 UErrorCode *status); 397 398 399 /* BEGIN android-added 400 Removed it after Android upgrade to ICU4.6. */ 401 /** 402 * Set the subject text string upon which the regular expression is looking for matches 403 * without changing any other aspect of the matching state. 404 * The new and previous text strings must have the same content. 405 * 406 * This function is intended for use in environments where ICU is operating on 407 * strings that may move around in memory. It provides a mechanism for notifying 408 * ICU that the string has been relocated, and providing a new UText to access the 409 * string in its new position. 
410 * 411 * Caution: this function is normally used only by very specialized 412 * system-level code. 413 * 414 * @param regexp The compiled regular expression. 415 * @param text The new (moved) text string. 416 * @param status Receives errors detected by this function. 417 * 418 * @internal ICU 4.6 419 */ 420 U_INTERNAL void U_EXPORT2 421 uregex_refreshUText(URegularExpression *regexp, 422 UText *text, 423 UErrorCode *status); 424 /* END android-added */ 425 426 /** 427 * Attempts to match the input string against the pattern. 428 * To succeed, the match must extend to the end of the string, 429 * or cover the complete match region. 430 * 431 * If startIndex >= zero the match operation starts at the specified 432 * index and must extend to the end of the input string. Any region 433 * that has been specified is reset. 434 * 435 * If startIndex == -1 the match must cover the input region, or the entire 436 * input string if no region has been set. This directly corresponds to 437 * Matcher.matches() in Java 438 * 439 * @param regexp The compiled regular expression. 440 * @param startIndex The input string (native) index at which to begin matching, or -1 441 * to match the input Region. 442 * @param status Receives errors detected by this function. 443 * @return TRUE if there is a match 444 * @stable ICU 3.0 445 */ 446 U_STABLE UBool U_EXPORT2 447 uregex_matches(URegularExpression *regexp, 448 int32_t startIndex, 449 UErrorCode *status); 450 451 /** 452 * 64bit version of uregex_matches. 453 * @draft ICU 4.6 454 */ 455 U_DRAFT UBool U_EXPORT2 456 uregex_matches64(URegularExpression *regexp, 457 int64_t startIndex, 458 UErrorCode *status); 459 460 /** 461 * Attempts to match the input string, starting from the specified index, against the pattern. 462 * The match may be of any length, and is not required to extend to the end 463 * of the input string. Contrast with uregex_matches(). 
464 * 465 * <p>If startIndex is >= 0 any input region that was set for this 466 * URegularExpression is reset before the operation begins. 467 * 468 * <p>If the specified starting index == -1 the match begins at the start of the input 469 * region, or at the start of the full string if no region has been specified. 470 * This corresponds directly with Matcher.lookingAt() in Java. 471 * 472 * <p>If the match succeeds then more information can be obtained via the 473 * <code>uregex_start()</code>, <code>uregex_end()</code>, 474 * and <code>uregex_group()</code> functions.</p> 475 * 476 * @param regexp The compiled regular expression. 477 * @param startIndex The input string (native) index at which to begin matching, or 478 * -1 to match the Input Region 479 * @param status A reference to a UErrorCode to receive any errors. 480 * @return TRUE if there is a match. 481 * @stable ICU 3.0 482 */ 483 U_STABLE UBool U_EXPORT2 484 uregex_lookingAt(URegularExpression *regexp, 485 int32_t startIndex, 486 UErrorCode *status); 487 488 /** 489 * 64bit version of uregex_lookingAt. 490 * @draft ICU 4.6 491 */ 492 U_DRAFT UBool U_EXPORT2 493 uregex_lookingAt64(URegularExpression *regexp, 494 int64_t startIndex, 495 UErrorCode *status); 496 497 /** 498 * Find the first matching substring of the input string that matches the pattern. 499 * If startIndex is >= zero the search for a match begins at the specified index, 500 * and any match region is reset. This corresponds directly with 501 * Matcher.find(startIndex) in Java. 502 * 503 * If startIndex == -1 the search begins at the start of the input region, 504 * or at the start of the full string if no region has been specified. 505 * 506 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 507 * <code>uregex_group()</code> will provide more information regarding the match. 508 * 509 * @param regexp The compiled regular expression. 
510 * @param startIndex The position (native) in the input string to begin the search, or 511 * -1 to search within the Input Region. 512 * @param status A reference to a UErrorCode to receive any errors. 513 * @return TRUE if a match is found. 514 * @stable ICU 3.0 515 */ 516 U_STABLE UBool U_EXPORT2 517 uregex_find(URegularExpression *regexp, 518 int32_t startIndex, 519 UErrorCode *status); 520 521 /** 522 * 64bit version of uregex_find. 523 * @draft ICU 4.6 524 */ 525 U_DRAFT UBool U_EXPORT2 526 uregex_find64(URegularExpression *regexp, 527 int64_t startIndex, 528 UErrorCode *status); 529 530 /** 531 * Find the next pattern match in the input string. Begin searching 532 * the input at the location following the end of the previous match, 533 * or at the start of the string (or region) if there is no 534 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 535 * <code>uregex_group()</code> will provide more information regarding the match. 536 * 537 * @param regexp The compiled regular expression. 538 * @param status A reference to a UErrorCode to receive any errors. 539 * @return TRUE if a match is found. 540 * @see uregex_reset 541 * @stable ICU 3.0 542 */ 543 U_STABLE UBool U_EXPORT2 544 uregex_findNext(URegularExpression *regexp, 545 UErrorCode *status); 546 547 /** 548 * Get the number of capturing groups in this regular expression's pattern. 549 * @param regexp The compiled regular expression. 550 * @param status A reference to a UErrorCode to receive any errors. 551 * @return the number of capture groups 552 * @stable ICU 3.0 553 */ 554 U_STABLE int32_t U_EXPORT2 555 uregex_groupCount(URegularExpression *regexp, 556 UErrorCode *status); 557 558 /** Extract the string for the specified matching expression or subexpression. 559 * Group #0 is the complete string of matched text. 560 * Group #1 is the text matched by the first set of capturing parentheses. 561 * 562 * @param regexp The compiled regular expression. 
563 * @param groupNum The capture group to extract. Group 0 is the complete 564 * match. The value of this parameter must be 565 * less than or equal to the number of capture groups in 566 * the pattern. 567 * @param dest Buffer to receive the matching string data 568 * @param destCapacity Capacity of the dest buffer. 569 * @param status A reference to a UErrorCode to receive any errors. 570 * @return Length of matching data, 571 * or -1 if no applicable match. 572 * @stable ICU 3.0 573 */ 574 U_STABLE int32_t U_EXPORT2 575 uregex_group(URegularExpression *regexp, 576 int32_t groupNum, 577 UChar *dest, 578 int32_t destCapacity, 579 UErrorCode *status); 580 581 582 /** Returns a shallow immutable clone of the entire input string. The returned UText current native index 583 * is set to the beginning of the requested capture group. The capture group length is also 584 * returned via groupLength. 585 * Group #0 is the complete string of matched text. 586 * Group #1 is the text matched by the first set of capturing parentheses. 587 * 588 * @param regexp The compiled regular expression. 589 * @param groupNum The capture group to extract. Group 0 is the complete 590 * match. The value of this parameter must be 591 * less than or equal to the number of capture groups in 592 * the pattern. 593 * @param dest A mutable UText in which to store the current input. 594 * If NULL, a new UText will be created as an immutable shallow clone 595 * of the entire input string. 596 * @param groupLength The group length of the desired capture group. 597 * @param status A reference to a UErrorCode to receive any errors. 598 * @return The subject text currently associated with this regular expression. 599 * If a pre-allocated UText was provided, it will always be used and returned. 
600 601 * 602 * @draft ICU 4.6 603 */ 604 U_DRAFT UText * U_EXPORT2 605 uregex_groupUText(URegularExpression *regexp, 606 int32_t groupNum, 607 UText *dest, 608 int64_t *groupLength, 609 UErrorCode *status); 610 611 612 /** Extract the string for the specified matching expression or subexpression. 613 * Group #0 is the complete string of matched text. 614 * Group #1 is the text matched by the first set of capturing parentheses. 615 * 616 * @param regexp The compiled regular expression. 617 * @param groupNum The capture group to extract. Group 0 is the complete 618 * match. The value of this parameter must be 619 * less than or equal to the number of capture groups in 620 * the pattern. 621 * @param dest Mutable UText to receive the matching string data. 622 * If NULL, a new UText will be created (which may not be mutable). 623 * @param status A reference to a UErrorCode to receive any errors. 624 * @return The matching string data. If a pre-allocated UText was provided, 625 * it will always be used and returned. 626 * 627 * @internal ICU 4.4 technology preview 628 */ 629 U_INTERNAL UText * U_EXPORT2 630 uregex_groupUTextDeep(URegularExpression *regexp, 631 int32_t groupNum, 632 UText *dest, 633 UErrorCode *status); 634 635 /** 636 * Returns the index in the input string of the start of the text matched by the 637 * specified capture group during the previous match operation. Return -1 if 638 * the capture group was not part of the last match. 639 * Group #0 refers to the complete range of matched text. 640 * Group #1 refers to the text matched by the first set of capturing parentheses. 641 * 642 * @param regexp The compiled regular expression. 643 * @param groupNum The capture group number 644 * @param status A reference to a UErrorCode to receive any errors. 645 * @return the starting (native) position in the input of the text matched 646 * by the specified group. 
647 * @stable ICU 3.0 648 */ 649 U_STABLE int32_t U_EXPORT2 650 uregex_start(URegularExpression *regexp, 651 int32_t groupNum, 652 UErrorCode *status); 653 654 /** 655 * 64bit version of uregex_start. 656 * @draft ICU 4.6 657 */ 658 U_DRAFT int64_t U_EXPORT2 659 uregex_start64(URegularExpression *regexp, 660 int32_t groupNum, 661 UErrorCode *status); 662 663 /** 664 * Returns the index in the input string of the position following the end 665 * of the text matched by the specified capture group. 666 * Return -1 if the capture group was not part of the last match. 667 * Group #0 refers to the complete range of matched text. 668 * Group #1 refers to the text matched by the first set of capturing parentheses. 669 * 670 * @param regexp The compiled regular expression. 671 * @param groupNum The capture group number 672 * @param status A reference to a UErrorCode to receive any errors. 673 * @return the (native) index of the position following the last matched character. 674 * @stable ICU 3.0 675 */ 676 U_STABLE int32_t U_EXPORT2 677 uregex_end(URegularExpression *regexp, 678 int32_t groupNum, 679 UErrorCode *status); 680 681 /** 682 * 64bit version of uregex_end. 683 * @draft ICU 4.6 684 */ 685 U_DRAFT int64_t U_EXPORT2 686 uregex_end64(URegularExpression *regexp, 687 int32_t groupNum, 688 UErrorCode *status); 689 690 /** 691 * Reset any saved state from the previous match. Has the effect of 692 * causing uregex_findNext to begin at the specified index, and causing 693 * uregex_start(), uregex_end() and uregex_group() to return an error 694 * indicating that there is no match information available. Clears any 695 * match region that may have been set. 696 * 697 * @param regexp The compiled regular expression. 698 * @param index The position (native) in the text at which a 699 * uregex_findNext() should begin searching. 700 * @param status A reference to a UErrorCode to receive any errors. 
701 * @stable ICU 3.0 702 */ 703 U_STABLE void U_EXPORT2 704 uregex_reset(URegularExpression *regexp, 705 int32_t index, 706 UErrorCode *status); 707 708 /** 709 * 64bit version of uregex_reset. 710 * @draft ICU 4.6 711 */ 712 U_DRAFT void U_EXPORT2 713 uregex_reset64(URegularExpression *regexp, 714 int64_t index, 715 UErrorCode *status); 716 717 /** Sets the limits of the matching region for this URegularExpression. 718 * The region is the part of the input string that will be considered when matching. 719 * Invoking this method resets any saved state from the previous match, 720 * then sets the region to start at the index specified by the regionStart parameter 721 * and end at the index specified by the regionLimit parameter. 722 * 723 * Depending on the transparency and anchoring being used (see useTransparentBounds 724 * and useAnchoringBounds), certain constructs such as anchors may behave differently 725 * at or around the boundaries of the region. 726 * 727 * The function will fail if regionStart is greater than regionLimit, or if either index 728 * is less than zero or greater than the length of the string being matched. 729 * 730 * @param regexp The compiled regular expression. 731 * @param regionStart The (native) index to begin searches at. 732 * @param regionLimit The (native) index to end searches at (exclusive). 733 * @param status A pointer to a UErrorCode to receive any errors. 734 * @stable ICU 4.0 735 */ 736 U_STABLE void U_EXPORT2 737 uregex_setRegion(URegularExpression *regexp, 738 int32_t regionStart, 739 int32_t regionLimit, 740 UErrorCode *status); 741 742 /** 743 * 64bit version of uregex_setRegion. 744 * @draft ICU 4.6 745 */ 746 U_DRAFT void U_EXPORT2 747 uregex_setRegion64(URegularExpression *regexp, 748 int64_t regionStart, 749 int64_t regionLimit, 750 UErrorCode *status); 751 752 /** 753 * Variation on uregex_setRegion that also takes a startIndex, setting the region 754 * without resetting the position for subsequent matches. 
755 * @draft ICU 4.6 756 */ 757 U_DRAFT void U_EXPORT2 758 uregex_setRegionAndStart(URegularExpression *regexp, 759 int64_t regionStart, 760 int64_t regionLimit, 761 int64_t startIndex, 762 UErrorCode *status); 763 764 /** 765 * Reports the start index of the matching region. Any matches found are limited to 766 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 767 * 768 * @param regexp The compiled regular expression. 769 * @param status A pointer to a UErrorCode to receive any errors. 770 * @return The starting (native) index of this matcher's region. 771 * @stable ICU 4.0 772 */ 773 U_STABLE int32_t U_EXPORT2 774 uregex_regionStart(const URegularExpression *regexp, 775 UErrorCode *status); 776 777 /** 778 * 64bit version of uregex_regionStart. 779 * @draft ICU 4.6 780 */ 781 U_DRAFT int64_t U_EXPORT2 782 uregex_regionStart64(const URegularExpression *regexp, 783 UErrorCode *status); 784 785 /** 786 * Reports the end index (exclusive) of the matching region for this URegularExpression. 787 * Any matches found are limited to the region bounded by regionStart (inclusive) 788 * and regionEnd (exclusive). 789 * 790 * @param regexp The compiled regular expression. 791 * @param status A pointer to a UErrorCode to receive any errors. 792 * @return The ending point (native) of this matcher's region. 793 * @stable ICU 4.0 794 */ 795 U_STABLE int32_t U_EXPORT2 796 uregex_regionEnd(const URegularExpression *regexp, 797 UErrorCode *status); 798 799 /** 800 * 64bit version of uregex_regionEnd. 801 * @draft ICU 4.6 802 */ 803 U_DRAFT int64_t U_EXPORT2 804 uregex_regionEnd64(const URegularExpression *regexp, 805 UErrorCode *status); 806 807 /** 808 * Queries the transparency of region bounds for this URegularExpression. 809 * See useTransparentBounds for a description of transparent and opaque bounds. 810 * By default, matching boundaries are opaque. 811 * 812 * @param regexp The compiled regular expression. 
813 * @param status A pointer to a UErrorCode to receive any errors. 814 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. 815 * @stable ICU 4.0 816 */ 817 U_STABLE UBool U_EXPORT2 818 uregex_hasTransparentBounds(const URegularExpression *regexp, 819 UErrorCode *status); 820 821 822 /** 823 * Sets the transparency of region bounds for this URegularExpression. 824 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 825 * If the boolean argument is FALSE, then opaque bounds will be used. 826 * 827 * Using transparent bounds, the boundaries of the matching region are transparent 828 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 829 * see text beyond the boundaries of the region while checking for a match. 830 * 831 * With opaque bounds, no text outside of the matching region is visible to lookahead, 832 * lookbehind, and boundary matching constructs. 833 * 834 * By default, opaque bounds are used. 835 * 836 * @param regexp The compiled regular expression. 837 * @param b TRUE for transparent bounds; FALSE for opaque bounds 838 * @param status A pointer to a UErrorCode to receive any errors. 839 * @stable ICU 4.0 840 **/ 841 U_STABLE void U_EXPORT2 842 uregex_useTransparentBounds(URegularExpression *regexp, 843 UBool b, 844 UErrorCode *status); 845 846 847 /** 848 * Return true if this URegularExpression is using anchoring bounds. 849 * By default, anchoring region bounds are used. 850 * 851 * @param regexp The compiled regular expression. 852 * @param status A pointer to a UErrorCode to receive any errors. 853 * @return TRUE if this matcher is using anchoring bounds. 854 * @stable ICU 4.0 855 */ 856 U_STABLE UBool U_EXPORT2 857 uregex_hasAnchoringBounds(const URegularExpression *regexp, 858 UErrorCode *status); 859 860 861 /** 862 * Set whether this URegularExpression is using Anchoring Bounds for its region. 
863 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 864 * and end of the region. Without Anchoring Bounds, anchors will only match at 865 * the positions they would in the complete text. 866 * 867 * Anchoring Bounds are the default for regions. 868 * 869 * @param regexp The compiled regular expression. 870 * @param b TRUE if to enable anchoring bounds; FALSE to disable them. 871 * @param status A pointer to a UErrorCode to receive any errors. 872 * @stable ICU 4.0 873 */ 874 U_STABLE void U_EXPORT2 875 uregex_useAnchoringBounds(URegularExpression *regexp, 876 UBool b, 877 UErrorCode *status); 878 879 /** 880 * Return TRUE if the most recent matching operation touched the 881 * end of the text being processed. In this case, additional input text could 882 * change the results of that match. 883 * 884 * @param regexp The compiled regular expression. 885 * @param status A pointer to a UErrorCode to receive any errors. 886 * @return TRUE if the most recent match hit the end of input 887 * @stable ICU 4.0 888 */ 889 U_STABLE UBool U_EXPORT2 890 uregex_hitEnd(const URegularExpression *regexp, 891 UErrorCode *status); 892 893 /** 894 * Return TRUE if the most recent match succeeded and additional input could cause 895 * it to fail. If this function returns false and a match was found, then more input 896 * might change the match but the match won't be lost. If a match was not found, 897 * then requireEnd has no meaning. 898 * 899 * @param regexp The compiled regular expression. 900 * @param status A pointer to a UErrorCode to receive any errors. 901 * @return TRUE if more input could cause the most recent match to no longer match. 902 * @stable ICU 4.0 903 */ 904 U_STABLE UBool U_EXPORT2 905 uregex_requireEnd(const URegularExpression *regexp, 906 UErrorCode *status); 907 908 909 910 911 912 /** 913 * Replaces every substring of the input that matches the pattern 914 * with the given replacement string.
This is a convenience function that 915 * provides a complete find-and-replace-all operation. 916 * 917 * This method scans the input string looking for matches of the pattern. 918 * Input that is not part of any match is copied unchanged to the 919 * destination buffer. Matched regions are replaced in the output 920 * buffer by the replacement string. The replacement string may contain 921 * references to capture groups; these take the form of $1, $2, etc. 922 * 923 * @param regexp The compiled regular expression. 924 * @param replacementText A string containing the replacement text. 925 * @param replacementLength The length of the replacement string, or 926 * -1 if it is NUL terminated. 927 * @param destBuf A (UChar *) buffer that will receive the result. 928 * @param destCapacity The capacity of the destination buffer. 929 * @param status A reference to a UErrorCode to receive any errors. 930 * @return The length of the string resulting from the find 931 * and replace operation. In the event that the 932 * destination capacity is inadequate, the return value 933 * is still the full length of the untruncated string. 934 * @stable ICU 3.0 935 */ 936 U_STABLE int32_t U_EXPORT2 937 uregex_replaceAll(URegularExpression *regexp, 938 const UChar *replacementText, 939 int32_t replacementLength, 940 UChar *destBuf, 941 int32_t destCapacity, 942 UErrorCode *status); 943 944 /** 945 * Replaces every substring of the input that matches the pattern 946 * with the given replacement string. This is a convenience function that 947 * provides a complete find-and-replace-all operation. 948 * 949 * This method scans the input string looking for matches of the pattern. 950 * Input that is not part of any match is copied unchanged to the 951 * destination buffer. Matched regions are replaced in the output 952 * buffer by the replacement string. The replacement string may contain 953 * references to capture groups; these take the form of $1, $2, etc.
954 * 955 * @param regexp The compiled regular expression. 956 * @param replacement A string containing the replacement text. 957 * @param dest A mutable UText that will receive the result. 958 * If NULL, a new UText will be created (which may not be mutable). 959 * @param status A reference to a UErrorCode to receive any errors. 960 * @return A UText containing the results of the find and replace. 961 * If a pre-allocated UText was provided, it will always be used and returned. 962 * 963 * @draft ICU 4.6 964 */ 965 U_DRAFT UText * U_EXPORT2 966 uregex_replaceAllUText(URegularExpression *regexp, 967 UText *replacement, 968 UText *dest, 969 UErrorCode *status); 970 971 /** 972 * Replaces the first substring of the input that matches the pattern 973 * with the given replacement string. This is a convenience function that 974 * provides a complete find-and-replace operation. 975 * 976 * This method scans the input string looking for a match of the pattern. 977 * All input that is not part of the match is copied unchanged to the 978 * destination buffer. The matched region is replaced in the output 979 * buffer by the replacement string. The replacement string may contain 980 * references to capture groups; these take the form of $1, $2, etc. 981 * 982 * @param regexp The compiled regular expression. 983 * @param replacementText A string containing the replacement text. 984 * @param replacementLength The length of the replacement string, or 985 * -1 if it is NUL terminated. 986 * @param destBuf A (UChar *) buffer that will receive the result. 987 * @param destCapacity The capacity of the destination buffer. 988 * @param status A reference to a UErrorCode to receive any errors. 989 * @return The length of the string resulting from the find 990 * and replace operation. In the event that the 991 * destination capacity is inadequate, the return value 992 * is still the full length of the untruncated string.
993 * @stable ICU 3.0 994 */ 995 U_STABLE int32_t U_EXPORT2 996 uregex_replaceFirst(URegularExpression *regexp, 997 const UChar *replacementText, 998 int32_t replacementLength, 999 UChar *destBuf, 1000 int32_t destCapacity, 1001 UErrorCode *status); 1002 1003 /** 1004 * Replaces the first substring of the input that matches the pattern 1005 * with the given replacement string. This is a convenience function that 1006 * provides a complete find-and-replace operation. 1007 * 1008 * This method scans the input string looking for a match of the pattern. 1009 * All input that is not part of the match is copied unchanged to the 1010 * destination buffer. The matched region is replaced in the output 1011 * buffer by the replacement string. The replacement string may contain 1012 * references to capture groups; these take the form of $1, $2, etc. 1013 * 1014 * @param regexp The compiled regular expression. 1015 * @param replacement A string containing the replacement text. 1016 * @param dest A mutable UText that will receive the result. 1017 * If NULL, a new UText will be created (which may not be mutable). 1018 * @param status A reference to a UErrorCode to receive any errors. 1019 * @return A UText containing the results of the find and replace. 1020 * If a pre-allocated UText was provided, it will always be used and returned. 1021 * 1022 * @draft ICU 4.6 1023 */ 1024 U_DRAFT UText * U_EXPORT2 1025 uregex_replaceFirstUText(URegularExpression *regexp, 1026 UText *replacement, 1027 UText *dest, 1028 UErrorCode *status); 1029 1030 1031 /** 1032 * Implements a replace operation intended to be used as part of an 1033 * incremental find-and-replace. 1034 * 1035 * <p>The input string, starting from the end of the previous match and ending at 1036 * the start of the current match, is appended to the destination string.
Then the 1037 * replacement string is appended to the output string, 1038 * including handling any substitutions of captured text.</p> 1039 * 1040 * <p>A note on preflight computation of buffer size and error handling: 1041 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1042 * designed to be chained, one after another, with the destination 1043 * buffer pointer and buffer capacity updated after each in preparation 1044 * for the next. If the destination buffer is exhausted partway through such a 1045 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1046 * ICU conventions are for a function to perform no action if it is 1047 * called with an error status, but for this one case, uregex_appendReplacement() 1048 * will operate normally so that buffer size computations will complete 1049 * correctly.</p> 1050 * 1051 * <p>For simple, prepackaged, non-incremental find-and-replace 1052 * operations, see replaceFirst() or replaceAll().</p> 1053 * 1054 * @param regexp The regular expression object. 1055 * @param replacementText The string that will replace the matched portion of the 1056 * input string as it is copied to the destination buffer. 1057 * The replacement text may contain references ($1, for 1058 * example) to capture groups from the match. 1059 * @param replacementLength The length of the replacement text string, 1060 * or -1 if the string is NUL terminated. 1061 * @param destBuf The buffer into which the results of the 1062 * find-and-replace are placed. On return, this pointer 1063 * will be updated to refer to the beginning of the 1064 * unused portion of buffer, leaving it in position for 1065 * a subsequent call to this function. 1066 * @param destCapacity The size of the output buffer. On return, this 1067 * parameter will be updated to reflect the space remaining 1068 * unused in the output buffer. 1069 * @param status A reference to a UErrorCode to receive any errors. 1070 * @return The length of the result string.
In the event that 1071 * destCapacity is inadequate, the full length of the 1072 * untruncated output string is returned. 1073 * 1074 * @stable ICU 3.0 1075 * 1076 */ 1077 U_STABLE int32_t U_EXPORT2 1078 uregex_appendReplacement(URegularExpression *regexp, 1079 const UChar *replacementText, 1080 int32_t replacementLength, 1081 UChar **destBuf, 1082 int32_t *destCapacity, 1083 UErrorCode *status); 1084 1085 1086 /** 1087 * Implements a replace operation intended to be used as part of an 1088 * incremental find-and-replace. 1089 * 1090 * <p>The input string, starting from the end of the previous match and ending at 1091 * the start of the current match, is appended to the destination string. Then the 1092 * replacement string is appended to the output string, 1093 * including handling any substitutions of captured text.</p> 1094 * 1095 * <p>For simple, prepackaged, non-incremental find-and-replace 1096 * operations, see replaceFirst() or replaceAll().</p> 1097 * 1098 * @param regexp The regular expression object. 1099 * @param replacementText The string that will replace the matched portion of the 1100 * input string as it is copied to the destination buffer. 1101 * The replacement text may contain references ($1, for 1102 * example) to capture groups from the match. 1103 * @param dest A mutable UText that will receive the result. Must not be NULL. 1104 * @param status A reference to a UErrorCode to receive any errors. 1105 * 1106 * @draft ICU 4.6 1107 */ 1108 U_DRAFT void U_EXPORT2 1109 uregex_appendReplacementUText(URegularExpression *regexp, 1110 UText *replacementText, 1111 UText *dest, 1112 UErrorCode *status); 1113 1114 1115 /** 1116 * As the final step in a find-and-replace operation, append the remainder 1117 * of the input string, starting at the position following the last match, 1118 * to the destination string.
<code>uregex_appendTail()</code> is intended 1119 * to be invoked after one or more invocations of the 1120 * <code>uregex_appendReplacement()</code> function. 1121 * 1122 * @param regexp The regular expression object. This is needed to 1123 * obtain the input string along with the position 1124 * of the last match within it. 1125 * @param destBuf The buffer in which the results of the 1126 * find-and-replace are placed. On return, the pointer 1127 * will be updated to refer to the beginning of the 1128 * unused portion of buffer. 1129 * @param destCapacity The size of the output buffer. On return, this 1130 * value will be updated to reflect the space remaining 1131 * unused in the output buffer. 1132 * @param status A reference to a UErrorCode to receive any errors. 1133 * @return The length of the result string. In the event that 1134 * destCapacity is inadequate, the full length of the 1135 * untruncated output string is returned. 1136 * 1137 * @stable ICU 3.0 1138 */ 1139 U_STABLE int32_t U_EXPORT2 1140 uregex_appendTail(URegularExpression *regexp, 1141 UChar **destBuf, 1142 int32_t *destCapacity, 1143 UErrorCode *status); 1144 1145 1146 /** 1147 * As the final step in a find-and-replace operation, append the remainder 1148 * of the input string, starting at the position following the last match, 1149 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1150 * to be invoked after one or more invocations of the 1151 * <code>uregex_appendReplacementUText()</code> function. 1152 * 1153 * @param regexp The regular expression object. This is needed to 1154 * obtain the input string along with the position 1155 * of the last match within it. 1156 * @param dest A mutable UText that will receive the result. Must not be NULL. * @param status A reference to a UErrorCode to receive any errors. 1157 * @return The destination UText.
1158 * 1159 * @draft ICU 4.6 1160 */ 1161 U_DRAFT UText * U_EXPORT2 1162 uregex_appendTailUText(URegularExpression *regexp, 1163 UText *dest, 1164 UErrorCode *status); 1165 1166 1167 1168 /** 1169 * Split a string into fields. Somewhat like split() from Perl. 1170 * The pattern matches identify delimiters that separate the input 1171 * into fields. The input data between the matches becomes the 1172 * fields themselves. 1173 * <p> 1174 * Each of the fields is copied from the input string to the destination 1175 * buffer, and NUL terminated. The position of each field within 1176 * the destination buffer is returned in the destFields array. 1177 * 1178 * Note: another choice for the design of this function would be to not 1179 * copy the resulting fields at all, but to return indexes and 1180 * lengths within the source text. 1181 * Advantages would be 1182 * o Faster. No Copying. 1183 * o Nothing extra needed when field data may contain embedded NUL chars. 1184 * o Less memory needed if working on large data. 1185 * Disadvantages 1186 * o Less consistent with C++ split, which copies into an 1187 * array of UnicodeStrings. 1188 * o No NUL termination, extracted fields would be less convenient 1189 * to use in most cases. 1190 * o Possible problems in the future, when support for Unicode Normalization 1191 * could cause the fields to not correspond exactly to 1192 * a range of the source text. 1193 * 1194 * @param regexp The compiled regular expression. 1195 * @param destBuf A (UChar *) buffer to receive the fields that 1196 * are extracted from the input string. These 1197 * field pointers will refer to positions within the 1198 * destination buffer supplied by the caller. Any 1199 * extra positions within the destFields array will be 1200 * set to NULL. 1201 * @param destCapacity The capacity of the destBuf. 1202 * @param requiredCapacity The actual capacity required of the destBuf.
1203 * If destCapacity is too small, requiredCapacity will return 1204 * the total capacity required to hold all of the output, and 1205 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1206 * @param destFields An array to be filled with the position of each 1207 * of the extracted fields within destBuf. 1208 * @param destFieldsCapacity The number of elements in the destFields array. 1209 * If the number of fields found is less than destFieldsCapacity, 1210 * the extra destFields elements are set to zero. 1211 * If destFieldsCapacity is too small, the trailing part of the 1212 * input, including any field delimiters, is treated as if it 1213 * were the last field - it is copied to the destBuf, and 1214 * its position in the destBuf is stored in the last element 1215 * of destFields. This behavior mimics that of Perl. It is not 1216 * an error condition, and no error status is returned when all destField 1217 * positions are used. 1218 * @param status A reference to a UErrorCode to receive any errors. 1219 * @return The number of fields into which the input string was split. 1220 * @stable ICU 3.0 1221 */ 1222 U_STABLE int32_t U_EXPORT2 1223 uregex_split( URegularExpression *regexp, 1224 UChar *destBuf, 1225 int32_t destCapacity, 1226 int32_t *requiredCapacity, 1227 UChar *destFields[], 1228 int32_t destFieldsCapacity, 1229 UErrorCode *status); 1230 1231 1232 /** 1233 * Split a string into fields. Somewhat like split() from Perl. 1234 * The pattern matches identify delimiters that separate the input 1235 * into fields. The input data between the matches becomes the 1236 * fields themselves. 1237 * <p> 1238 * The behavior of this function is not very closely aligned with uregex_split(); 1239 * instead, it is based on (and implemented directly on top of) the C++ split method. 1240 * 1241 * @param regexp The compiled regular expression. 1242 * @param destFields An array of mutable UText structs to receive the results of the split.
1243 * If a field is NULL, a new UText is allocated to contain the results for 1244 * that field. This new UText is not guaranteed to be mutable. 1245 * @param destFieldsCapacity The number of elements in the destination array. 1246 * If the number of fields found is less than destFieldsCapacity, the 1247 * extra strings in the destination array are not altered. 1248 * If the number of destination strings is less than the number 1249 * of fields, the trailing part of the input string, including any 1250 * field delimiters, is placed in the last destination string. 1251 * This behavior mimics that of Perl. It is not an error condition, and no 1252 * error status is returned when all destField positions are used. 1253 * @param status A reference to a UErrorCode to receive any errors. 1254 * @return The number of fields into which the input string was split. 1255 * 1256 * @draft ICU 4.6 1257 */ 1258 U_DRAFT int32_t U_EXPORT2 1259 uregex_splitUText(URegularExpression *regexp, 1260 UText *destFields[], 1261 int32_t destFieldsCapacity, 1262 UErrorCode *status); 1263 1264 1265 1266 1267 /** 1268 * Set a processing time limit for match operations with this URegularExpression. 1269 * 1270 * Some patterns, when matching certain strings, can run in exponential time. 1271 * For practical purposes, the match operation may appear to be in an 1272 * infinite loop. 1273 * When a limit is set a match operation will fail with an error if the 1274 * limit is exceeded. 1275 * <p> 1276 * The units of the limit are steps of the match engine. 1277 * Correspondence with actual processor time will depend on the speed 1278 * of the processor and the details of the specific pattern, but will 1279 * typically be on the order of milliseconds. 1280 * <p> 1281 * By default, the matching time is not limited. 1282 * <p> 1283 * 1284 * @param regexp The compiled regular expression. 1285 * @param limit The limit value, or 0 for no limit.
1286 * @param status A reference to a UErrorCode to receive any errors. 1287 * @stable ICU 4.0 1288 */ 1289 U_STABLE void U_EXPORT2 1290 uregex_setTimeLimit(URegularExpression *regexp, 1291 int32_t limit, 1292 UErrorCode *status); 1293 1294 /** 1295 * Get the time limit for matches with this URegularExpression. 1296 * A return value of zero indicates that there is no limit. 1297 * 1298 * @param regexp The compiled regular expression. 1299 * @param status A reference to a UErrorCode to receive any errors. 1300 * @return the maximum allowed time for a match, in units of processing steps. 1301 * @stable ICU 4.0 1302 */ 1303 U_STABLE int32_t U_EXPORT2 1304 uregex_getTimeLimit(const URegularExpression *regexp, 1305 UErrorCode *status); 1306 1307 /** 1308 * Set the amount of heap storage available for use by the match backtracking stack. 1309 * <p> 1310 * ICU uses a backtracking regular expression engine, with the backtrack stack 1311 * maintained on the heap. This function sets the limit to the amount of memory 1312 * that can be used for this purpose. A backtracking stack overflow will 1313 * result in an error from the match operation that caused it. 1314 * <p> 1315 * A limit is desirable because a malicious or poorly designed pattern can use 1316 * excessive memory, potentially crashing the process. A limit is enabled 1317 * by default. 1318 * <p> 1319 * @param regexp The compiled regular expression. 1320 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1321 * A value of -1 means no limit. 1322 * The limit must be greater than zero, or -1. 1323 * @param status A reference to a UErrorCode to receive any errors. 1324 * 1325 * @stable ICU 4.0 1326 */ 1327 U_STABLE void U_EXPORT2 1328 uregex_setStackLimit(URegularExpression *regexp, 1329 int32_t limit, 1330 UErrorCode *status); 1331 1332 /** 1333 * Get the size of the heap storage available for use by the backtracking stack.
1334 * * @param regexp The compiled regular expression. * @param status A reference to a UErrorCode to receive any errors. 1335 * @return the maximum backtracking stack size, in bytes, or zero if the 1336 * stack size is unlimited. 1337 * @stable ICU 4.0 1338 */ 1339 U_STABLE int32_t U_EXPORT2 1340 uregex_getStackLimit(const URegularExpression *regexp, 1341 UErrorCode *status); 1342 1343 1344 /** 1345 * Function pointer for a regular expression matching callback function. 1346 * When set, a callback function will be called periodically during matching 1347 * operations. If the callback function returns FALSE, the matching 1348 * operation will be terminated early. 1349 * 1350 * Note: the callback function must not call other functions on this 1351 * URegularExpression. 1352 * 1353 * @param context context pointer. The callback function will be invoked 1354 * with the context specified at the time that 1355 * uregex_setMatchCallback() is called. 1356 * @param steps the accumulated processing time, in match steps, 1357 * for this matching operation. 1358 * @return TRUE to continue the matching operation. 1359 * FALSE to terminate the matching operation. 1360 * @stable ICU 4.0 1361 */ 1362 U_CDECL_BEGIN 1363 typedef UBool U_CALLCONV URegexMatchCallback ( 1364 const void *context, 1365 int32_t steps); 1366 U_CDECL_END 1367 1368 /** 1369 * Set a callback function for this URegularExpression. 1370 * During matching operations the function will be called periodically, 1371 * giving the application the opportunity to terminate a long-running 1372 * match. 1373 * 1374 * @param regexp The compiled regular expression. 1375 * @param callback A pointer to the user-supplied callback function. 1376 * @param context User context pointer. The value supplied at the 1377 * time the callback function is set will be saved 1378 * and passed to the callback each time that it is called. 1379 * @param status A reference to a UErrorCode to receive any errors.
1380 * @stable ICU 4.0 1381 */ 1382 U_STABLE void U_EXPORT2 1383 uregex_setMatchCallback(URegularExpression *regexp, 1384 URegexMatchCallback *callback, 1385 const void *context, 1386 UErrorCode *status); 1387 1388 1389 /** 1390 * Get the callback function for this URegularExpression. 1391 * 1392 * @param regexp The compiled regular expression. 1393 * @param callback Out parameter, receives a pointer to the user-supplied 1394 * callback function. 1395 * @param context Out parameter, receives the user context pointer that 1396 * was set when uregex_setMatchCallback() was called. 1397 * @param status A reference to a UErrorCode to receive any errors. 1398 * @stable ICU 4.0 1399 */ 1400 U_STABLE void U_EXPORT2 1401 uregex_getMatchCallback(const URegularExpression *regexp, 1402 URegexMatchCallback **callback, 1403 const void **context, 1404 UErrorCode *status); 1405 1406 1407 /** 1408 * Function pointer for a regular expression find callback function. 1409 * 1410 * When set, a callback function will be called during a find operation 1411 * and for operations that depend on find, such as findNext, split and some replace 1412 * operations like replaceFirst. 1413 * The callback will usually be called after each attempt at a match, but this is not a 1414 * guarantee that the callback will be invoked at each character. For finds where the 1415 * match engine is invoked at each character, this may be close to true, but less likely 1416 * for more optimized loops where the pattern is known to only start, and the match 1417 * engine invoked, at certain characters. 1418 * When invoked, this callback will specify the index at which a match operation is about 1419 * to be attempted, giving the application the opportunity to terminate a long-running 1420 * find operation. 1421 * 1422 * If the callback function returns FALSE, the find operation will be terminated early.
1423 * 1424 * Note: the callback function must not call other functions on this 1425 * URegularExpression. 1426 * 1427 * @param context context pointer. The callback function will be invoked 1428 * with the context specified at the time that 1429 * uregex_setFindProgressCallback() is called. 1430 * @param matchIndex the next index at which a match will be attempted for this 1431 * find operation. If this callback interrupts the search, this is the 1432 * index at which a find/findNext operation may be re-initiated. 1433 * @return TRUE to continue the matching operation. 1434 * FALSE to terminate the matching operation. 1435 * @draft ICU 4.6 1436 */ 1437 U_CDECL_BEGIN 1438 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1439 const void *context, 1440 int64_t matchIndex); 1441 U_CDECL_END 1442 1443 /** 1444 * Set the find progress callback function for this URegularExpression. 1445 * 1446 * @param regexp The compiled regular expression. 1447 * @param callback A pointer to the user-supplied callback function. 1448 * @param context User context pointer. The value supplied at the 1449 * time the callback function is set will be saved 1450 * and passed to the callback each time that it is called. 1451 * @param status A reference to a UErrorCode to receive any errors. 1452 * @draft ICU 4.6 1453 */ 1454 U_DRAFT void U_EXPORT2 1455 uregex_setFindProgressCallback(URegularExpression *regexp, 1456 URegexFindProgressCallback *callback, 1457 const void *context, 1458 UErrorCode *status); 1459 1460 1461 /** 1462 * Get the find progress callback function for this URegularExpression. 1463 * 1464 * @param regexp The compiled regular expression. 1465 * @param callback Out parameter, receives a pointer to the user-supplied 1466 * callback function. 1467 * @param context Out parameter, receives the user context pointer that 1468 * was set when uregex_setFindProgressCallback() was called. 1469 * @param status A reference to a UErrorCode to receive any errors.
1470 * @draft ICU 4.6 1471 */ 1472 U_DRAFT void U_EXPORT2 1473 uregex_getFindProgressCallback(const URegularExpression *regexp, 1474 URegexFindProgressCallback **callback, 1475 const void **context, 1476 UErrorCode *status); 1477 1478 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1479 #endif /* UREGEX_H */ 1480