1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2004-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: uregex.h 9 * encoding: UTF-8 10 * indentation:4 11 * 12 * created on: 2004mar09 13 * created by: Andy Heninger 14 * 15 * ICU Regular Expressions, API for C 16 */ 17 18 /** 19 * \file 20 * \brief C API: Regular Expressions 21 * 22 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 23 */ 24 25 #ifndef UREGEX_H 26 #define UREGEX_H 27 28 #include "unicode/utext.h" 29 #include "unicode/utypes.h" 30 31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 32 33 #include "unicode/parseerr.h" 34 35 #if U_SHOW_CPLUSPLUS_API 36 #include "unicode/localpointer.h" 37 #endif // U_SHOW_CPLUSPLUS_API 38 39 struct URegularExpression; 40 /** 41 * Structure representing a compiled regular expression, plus the results 42 * of a match operation. 43 * @stable ICU 3.0 44 */ 45 typedef struct URegularExpression URegularExpression; 46 47 48 /** 49 * Constants for Regular Expression Match Modes. 50 * @stable ICU 2.4 51 */ 52 typedef enum URegexpFlag{ 53 54 #ifndef U_HIDE_DRAFT_API 55 /** Forces normalization of pattern and strings. 56 Not implemented yet, just a placeholder, hence draft. 57 @draft ICU 2.4 */ 58 UREGEX_CANON_EQ = 128, 59 #endif /* U_HIDE_DRAFT_API */ 60 /** Enable case insensitive matching. @stable ICU 2.4 */ 61 UREGEX_CASE_INSENSITIVE = 2, 62 63 /** Allow white space and comments within patterns @stable ICU 2.4 */ 64 UREGEX_COMMENTS = 4, 65 66 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 67 * @stable ICU 2.4 */ 68 UREGEX_DOTALL = 32, 69 70 /** If set, treat the entire pattern as a literal string. 
71 * Metacharacters or escape sequences in the input sequence will be given 72 * no special meaning. 73 * 74 * The flag UREGEX_CASE_INSENSITIVE retains its impact 75 * on matching when used in conjunction with this flag. 76 * The other flags become superfluous. 77 * 78 * @stable ICU 4.0 79 */ 80 UREGEX_LITERAL = 16, 81 82 /** Control behavior of "$" and "^" 83 * If set, recognize line terminators within string, 84 * otherwise, match only at start and end of input string. 85 * @stable ICU 2.4 */ 86 UREGEX_MULTILINE = 8, 87 88 /** Unix-only line endings. 89 * When this mode is enabled, only \\u000a is recognized as a line ending 90 * in the behavior of ., ^, and $. 91 * @stable ICU 4.0 92 */ 93 UREGEX_UNIX_LINES = 1, 94 95 /** Unicode word boundaries. 96 * If set, \b uses the Unicode TR 29 definition of word boundaries. 97 * Warning: Unicode word boundaries are quite different from 98 * traditional regular expression word boundaries. See 99 * http://unicode.org/reports/tr29/#Word_Boundaries 100 * @stable ICU 2.8 101 */ 102 UREGEX_UWORD = 256, 103 104 /** Error on Unrecognized backslash escapes. 105 * If set, fail with an error on patterns that contain 106 * backslash-escaped ASCII letters without a known special 107 * meaning. If this flag is not set, these 108 * escaped letters represent themselves. 109 * @stable ICU 4.0 110 */ 111 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 112 113 } URegexpFlag; 114 115 /** 116 * Open (compile) an ICU regular expression. Compiles the regular expression in 117 * string form into an internal representation using the specified match mode flags. 118 * The resulting regular expression handle can then be used to perform various 119 * matching operations. 120 * 121 * 122 * @param pattern The Regular Expression pattern to be compiled. 123 * @param patternLength The length of the pattern, or -1 if the pattern is 124 * NUL terminated. 
125 * @param flags Flags that alter the default matching behavior for 126 * the regular expression, UREGEX_CASE_INSENSITIVE, for 127 * example. For default behavior, set this parameter to zero. 128 * See <code>enum URegexpFlag</code>. All desired flags 129 * are bitwise-ORed together. 130 * @param pe Receives the position (line and column numbers) of any syntax 131 * error within the source regular expression string. If this 132 * information is not wanted, pass NULL for this parameter. 133 * @param status Receives error detected by this function. 134 * @stable ICU 3.0 135 * 136 */ 137 U_CAPI URegularExpression * U_EXPORT2 138 uregex_open( const UChar *pattern, 139 int32_t patternLength, 140 uint32_t flags, 141 UParseError *pe, 142 UErrorCode *status); 143 144 /** 145 * Open (compile) an ICU regular expression. Compiles the regular expression in 146 * string form into an internal representation using the specified match mode flags. 147 * The resulting regular expression handle can then be used to perform various 148 * matching operations. 149 * <p> 150 * The contents of the pattern UText will be extracted and saved. Ownership of the 151 * UText struct itself remains with the caller. This is to match the behavior of 152 * uregex_open(). 153 * 154 * @param pattern The Regular Expression pattern to be compiled. 155 * @param flags Flags that alter the default matching behavior for 156 * the regular expression, UREGEX_CASE_INSENSITIVE, for 157 * example. For default behavior, set this parameter to zero. 158 * See <code>enum URegexpFlag</code>. All desired flags 159 * are bitwise-ORed together. 160 * @param pe Receives the position (line and column numbers) of any syntax 161 * error within the source regular expression string. If this 162 * information is not wanted, pass NULL for this parameter. 163 * @param status Receives error detected by this function. 
164 * 165 * @stable ICU 4.6 166 */ 167 U_CAPI URegularExpression * U_EXPORT2 168 uregex_openUText(UText *pattern, 169 uint32_t flags, 170 UParseError *pe, 171 UErrorCode *status); 172 173 #if !UCONFIG_NO_CONVERSION 174 /** 175 * Open (compile) an ICU regular expression. The resulting regular expression 176 * handle can then be used to perform various matching operations. 177 * <p> 178 * This function is the same as uregex_open, except that the pattern 179 * is supplied as an 8 bit char * string in the default code page. 180 * 181 * @param pattern The Regular Expression pattern to be compiled, 182 * NUL terminated. 183 * @param flags Flags that alter the default matching behavior for 184 * the regular expression, UREGEX_CASE_INSENSITIVE, for 185 * example. For default behavior, set this parameter to zero. 186 * See <code>enum URegexpFlag</code>. All desired flags 187 * are bitwise-ORed together. 188 * @param pe Receives the position (line and column numbers) of any syntax 189 * error within the source regular expression string. If this 190 * information is not wanted, pass NULL for this parameter. 191 * @param status Receives errors detected by this function. 192 * @return The URegularExpression object representing the compiled 193 * pattern. 194 * 195 * @stable ICU 3.0 196 */ 197 U_CAPI URegularExpression * U_EXPORT2 198 uregex_openC( const char *pattern, 199 uint32_t flags, 200 UParseError *pe, 201 UErrorCode *status); 202 #endif 203 204 205 206 /** 207 * Close the regular expression, recovering all resources (memory) it 208 * was holding. 209 * 210 * @param regexp The regular expression to be closed. 211 * @stable ICU 3.0 212 */ 213 U_CAPI void U_EXPORT2 214 uregex_close(URegularExpression *regexp); 215 216 #if U_SHOW_CPLUSPLUS_API 217 218 U_NAMESPACE_BEGIN 219 220 /** 221 * \class LocalURegularExpressionPointer 222 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 223 * For most methods see the LocalPointerBase base class. 
224 * 225 * @see LocalPointerBase 226 * @see LocalPointer 227 * @stable ICU 4.4 228 */ 229 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 230 231 U_NAMESPACE_END 232 233 #endif 234 235 /** 236 * Make a copy of a compiled regular expression. Cloning a regular 237 * expression is faster than opening a second instance from the source 238 * form of the expression, and requires less memory. 239 * <p> 240 * Note that the current input string and the position of any matched text 241 * within it are not cloned; only the pattern itself and the 242 * match mode flags are copied. 243 * <p> 244 * Cloning can be particularly useful to threaded applications that perform 245 * multiple match operations in parallel. Each concurrent RE 246 * operation requires its own instance of a URegularExpression. 247 * 248 * @param regexp The compiled regular expression to be cloned. 249 * @param status Receives indication of any errors encountered 250 * @return the cloned copy of the compiled regular expression. 251 * @stable ICU 3.0 252 */ 253 U_CAPI URegularExpression * U_EXPORT2 254 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 255 256 /** 257 * Returns a pointer to the source form of the pattern for this regular expression. 258 * This function will work even if the pattern was originally specified as a UText. 259 * 260 * @param regexp The compiled regular expression. 261 * @param patLength This output parameter will be set to the length of the 262 * pattern string. A NULL pointer may be used here if the 263 * pattern length is not needed, as would be the case if 264 * the pattern is known in advance to be a NUL terminated 265 * string. 266 * @param status Receives errors detected by this function. 267 * @return a pointer to the pattern string. The storage for the string is 268 * owned by the regular expression object, and must not be 269 * altered or deleted by the application. 
The returned string 270 * will remain valid until the regular expression is closed. 271 * @stable ICU 3.0 272 */ 273 U_CAPI const UChar * U_EXPORT2 274 uregex_pattern(const URegularExpression *regexp, 275 int32_t *patLength, 276 UErrorCode *status); 277 278 /** 279 * Returns the source text of the pattern for this regular expression. 280 * This function will work even if the pattern was originally specified as a UChar string. 281 * 282 * @param regexp The compiled regular expression. 283 * @param status Receives errors detected by this function. 284 * @return the pattern text. The storage for the text is owned by the regular expression 285 * object, and must not be altered or deleted. 286 * 287 * @stable ICU 4.6 288 */ 289 U_CAPI UText * U_EXPORT2 290 uregex_patternUText(const URegularExpression *regexp, 291 UErrorCode *status); 292 293 /** 294 * Get the match mode flags that were specified when compiling this regular expression. 295 * @param status Receives errors detected by this function. 296 * @param regexp The compiled regular expression. 297 * @return The match mode flags 298 * @see URegexpFlag 299 * @stable ICU 3.0 300 */ 301 U_CAPI int32_t U_EXPORT2 302 uregex_flags(const URegularExpression *regexp, 303 UErrorCode *status); 304 305 306 /** 307 * Set the subject text string upon which the regular expression will look for matches. 308 * This function may be called any number of times, allowing the regular 309 * expression pattern to be applied to different strings. 310 * <p> 311 * Regular expression matching operations work directly on the application's 312 * string data. No copy is made. The subject string data must not be 313 * altered after calling this function until after all regular expression 314 * operations involving this string data are completed. 315 * <p> 316 * Zero length strings are permitted. In this case, no subsequent match 317 * operation will dereference the text string pointer. 318 * 319 * @param regexp The compiled regular expression. 
320 * @param text The subject text string. 321 * @param textLength The length of the subject text, or -1 if the string 322 * is NUL terminated. 323 * @param status Receives errors detected by this function. 324 * @stable ICU 3.0 325 */ 326 U_CAPI void U_EXPORT2 327 uregex_setText(URegularExpression *regexp, 328 const UChar *text, 329 int32_t textLength, 330 UErrorCode *status); 331 332 333 /** 334 * Set the subject text string upon which the regular expression will look for matches. 335 * This function may be called any number of times, allowing the regular 336 * expression pattern to be applied to different strings. 337 * <p> 338 * Regular expression matching operations work directly on the application's 339 * string data; only a shallow clone is made. The subject string data must not be 340 * altered after calling this function until after all regular expression 341 * operations involving this string data are completed. 342 * 343 * @param regexp The compiled regular expression. 344 * @param text The subject text string. 345 * @param status Receives errors detected by this function. 346 * 347 * @stable ICU 4.6 348 */ 349 U_CAPI void U_EXPORT2 350 uregex_setUText(URegularExpression *regexp, 351 UText *text, 352 UErrorCode *status); 353 354 /** 355 * Get the subject text that is currently associated with this 356 * regular expression object. If the input was supplied using uregex_setText(), 357 * that pointer will be returned. Otherwise, the characters in the input will 358 * be extracted to a buffer and returned. In either case, ownership remains 359 * with the regular expression object. 360 * 361 * This function will work even if the input was originally specified as a UText. 362 * 363 * @param regexp The compiled regular expression. 364 * @param textLength The length of the string is returned in this output parameter. 
365 * A NULL pointer may be used here if the 366 * text length is not needed, as would be the case if 367 * the text is known in advance to be a NUL terminated 368 * string. 369 * @param status Receives errors detected by this function. 370 * @return Pointer to the subject text string currently associated with 371 * this regular expression. 372 * @stable ICU 3.0 373 */ 374 U_CAPI const UChar * U_EXPORT2 375 uregex_getText(URegularExpression *regexp, 376 int32_t *textLength, 377 UErrorCode *status); 378 379 /** 380 * Get the subject text that is currently associated with this 381 * regular expression object. 382 * 383 * This function will work even if the input was originally specified as a UChar string. 384 * 385 * @param regexp The compiled regular expression. 386 * @param dest A mutable UText in which to store the current input. 387 * If NULL, a new UText will be created as an immutable shallow clone 388 * of the actual input string. 389 * @param status Receives errors detected by this function. 390 * @return The subject text currently associated with this regular expression. 391 * If a pre-allocated UText was provided, it will always be used and returned. 392 * 393 * @stable ICU 4.6 394 */ 395 U_CAPI UText * U_EXPORT2 396 uregex_getUText(URegularExpression *regexp, 397 UText *dest, 398 UErrorCode *status); 399 400 /** 401 * Set the subject text string upon which the regular expression is looking for matches 402 * without changing any other aspect of the matching state. 403 * The new and previous text strings must have the same content. 404 * 405 * This function is intended for use in environments where ICU is operating on 406 * strings that may move around in memory. It provides a mechanism for notifying 407 * ICU that the string has been relocated, and providing a new UText to access the 408 * string in its new position. 
409 * 410 * Note that the regular expression implementation never copies the underlying text 411 * of a string being matched, but always operates directly on the original text 412 * provided by the user. Refreshing simply drops the references to the old text 413 * and replaces them with references to the new. 414 * 415 * Caution: this function is normally used only by very specialized 416 * system-level code. One example use case is with garbage collection 417 * that moves the text in memory. 418 * 419 * @param regexp The compiled regular expression. 420 * @param text The new (moved) text string. 421 * @param status Receives errors detected by this function. 422 * 423 * @stable ICU 4.8 424 */ 425 U_CAPI void U_EXPORT2 426 uregex_refreshUText(URegularExpression *regexp, 427 UText *text, 428 UErrorCode *status); 429 430 /** 431 * Attempts to match the input string against the pattern. 432 * To succeed, the match must extend to the end of the string, 433 * or cover the complete match region. 434 * 435 * If startIndex >= zero the match operation starts at the specified 436 * index and must extend to the end of the input string. Any region 437 * that has been specified is reset. 438 * 439 * If startIndex == -1 the match must cover the input region, or the entire 440 * input string if no region has been set. This directly corresponds to 441 * Matcher.matches() in Java 442 * 443 * @param regexp The compiled regular expression. 444 * @param startIndex The input string (native) index at which to begin matching, or -1 445 * to match the input Region. 446 * @param status Receives errors detected by this function. 447 * @return true if there is a match 448 * @stable ICU 3.0 449 */ 450 U_CAPI UBool U_EXPORT2 451 uregex_matches(URegularExpression *regexp, 452 int32_t startIndex, 453 UErrorCode *status); 454 455 /** 456 * 64bit version of uregex_matches. 457 * Attempts to match the input string against the pattern. 
458 * To succeed, the match must extend to the end of the string, 459 * or cover the complete match region. 460 * 461 * If startIndex >= zero the match operation starts at the specified 462 * index and must extend to the end of the input string. Any region 463 * that has been specified is reset. 464 * 465 * If startIndex == -1 the match must cover the input region, or the entire 466 * input string if no region has been set. This directly corresponds to 467 * Matcher.matches() in Java 468 * 469 * @param regexp The compiled regular expression. 470 * @param startIndex The input string (native) index at which to begin matching, or -1 471 * to match the input Region. 472 * @param status Receives errors detected by this function. 473 * @return true if there is a match 474 * @stable ICU 4.6 475 */ 476 U_CAPI UBool U_EXPORT2 477 uregex_matches64(URegularExpression *regexp, 478 int64_t startIndex, 479 UErrorCode *status); 480 481 /** 482 * Attempts to match the input string, starting from the specified index, against the pattern. 483 * The match may be of any length, and is not required to extend to the end 484 * of the input string. Contrast with uregex_matches(). 485 * 486 * <p>If startIndex is >= 0 any input region that was set for this 487 * URegularExpression is reset before the operation begins. 488 * 489 * <p>If the specified starting index == -1 the match begins at the start of the input 490 * region, or at the start of the full string if no region has been specified. 491 * This corresponds directly with Matcher.lookingAt() in Java. 492 * 493 * <p>If the match succeeds then more information can be obtained via the 494 * <code>uregex_start()</code>, <code>uregex_end()</code>, 495 * and <code>uregex_group()</code> functions.</p> 496 * 497 * @param regexp The compiled regular expression.
498 * @param startIndex The input string (native) index at which to begin matching, or 499 * -1 to match the Input Region 500 * @param status A reference to a UErrorCode to receive any errors. 501 * @return true if there is a match. 502 * @stable ICU 3.0 503 */ 504 U_CAPI UBool U_EXPORT2 505 uregex_lookingAt(URegularExpression *regexp, 506 int32_t startIndex, 507 UErrorCode *status); 508 509 /** 510 * 64bit version of uregex_lookingAt. 511 * Attempts to match the input string, starting from the specified index, against the pattern. 512 * The match may be of any length, and is not required to extend to the end 513 * of the input string. Contrast with uregex_matches(). 514 * 515 * <p>If startIndex is >= 0 any input region that was set for this 516 * URegularExpression is reset before the operation begins. 517 * 518 * <p>If the specified starting index == -1 the match begins at the start of the input 519 * region, or at the start of the full string if no region has been specified. 520 * This corresponds directly with Matcher.lookingAt() in Java. 521 * 522 * <p>If the match succeeds then more information can be obtained via the 523 * <code>uregex_start()</code>, <code>uregex_end()</code>, 524 * and <code>uregex_group()</code> functions.</p> 525 * 526 * @param regexp The compiled regular expression. 527 * @param startIndex The input string (native) index at which to begin matching, or 528 * -1 to match the Input Region 529 * @param status A reference to a UErrorCode to receive any errors. 530 * @return true if there is a match. 531 * @stable ICU 4.6 532 */ 533 U_CAPI UBool U_EXPORT2 534 uregex_lookingAt64(URegularExpression *regexp, 535 int64_t startIndex, 536 UErrorCode *status); 537 538 /** 539 * Find the first matching substring of the input string that matches the pattern. 540 * If startIndex is >= zero the search for a match begins at the specified index, 541 * and any match region is reset. This corresponds directly with 542 * Matcher.find(startIndex) in Java.
543 * 544 * If startIndex == -1 the search begins at the start of the input region, 545 * or at the start of the full string if no region has been specified. 546 * 547 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 548 * <code>uregex_group()</code> will provide more information regarding the match. 549 * 550 * @param regexp The compiled regular expression. 551 * @param startIndex The position (native) in the input string to begin the search, or 552 * -1 to search within the Input Region. 553 * @param status A reference to a UErrorCode to receive any errors. 554 * @return true if a match is found. 555 * @stable ICU 3.0 556 */ 557 U_CAPI UBool U_EXPORT2 558 uregex_find(URegularExpression *regexp, 559 int32_t startIndex, 560 UErrorCode *status); 561 562 /** 563 * 64bit version of uregex_find. 564 * Find the first matching substring of the input string that matches the pattern. 565 * If startIndex is >= zero the search for a match begins at the specified index, 566 * and any match region is reset. This corresponds directly with 567 * Matcher.find(startIndex) in Java. 568 * 569 * If startIndex == -1 the search begins at the start of the input region, 570 * or at the start of the full string if no region has been specified. 571 * 572 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 573 * <code>uregex_group()</code> will provide more information regarding the match. 574 * 575 * @param regexp The compiled regular expression. 576 * @param startIndex The position (native) in the input string to begin the search, or 577 * -1 to search within the Input Region. 578 * @param status A reference to a UErrorCode to receive any errors. 579 * @return true if a match is found. 580 * @stable ICU 4.6 581 */ 582 U_CAPI UBool U_EXPORT2 583 uregex_find64(URegularExpression *regexp, 584 int64_t startIndex, 585 UErrorCode *status); 586 587 /** 588 * Find the next pattern match in the input string. 
Begin searching 589 * the input at the location following the end of the previous match, 590 * or at the start of the string (or region) if there is no 591 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 592 * <code>uregex_group()</code> will provide more information regarding the match. 593 * 594 * @param regexp The compiled regular expression. 595 * @param status A reference to a UErrorCode to receive any errors. 596 * @return true if a match is found. 597 * @see uregex_reset 598 * @stable ICU 3.0 599 */ 600 U_CAPI UBool U_EXPORT2 601 uregex_findNext(URegularExpression *regexp, 602 UErrorCode *status); 603 604 /** 605 * Get the number of capturing groups in this regular expression's pattern. 606 * @param regexp The compiled regular expression. 607 * @param status A reference to a UErrorCode to receive any errors. 608 * @return the number of capture groups 609 * @stable ICU 3.0 610 */ 611 U_CAPI int32_t U_EXPORT2 612 uregex_groupCount(URegularExpression *regexp, 613 UErrorCode *status); 614 615 /** 616 * Get the group number corresponding to a named capture group. 617 * The returned number can be used with any function that accesses 618 * capture groups by number. 619 * 620 * The function returns an error status if the specified name does not 621 * appear in the pattern. 622 * 623 * @param regexp The compiled regular expression. 624 * @param groupName The capture group name. 625 * @param nameLength The length of the name, or -1 if the name is a 626 * nul-terminated string. 627 * @param status A pointer to a UErrorCode to receive any errors. 628 * 629 * @stable ICU 55 630 */ 631 U_CAPI int32_t U_EXPORT2 632 uregex_groupNumberFromName(URegularExpression *regexp, 633 const UChar *groupName, 634 int32_t nameLength, 635 UErrorCode *status); 636 637 638 /** 639 * Get the group number corresponding to a named capture group. 640 * The returned number can be used with any function that accesses 641 * capture groups by number.
642 * 643 * The function returns an error status if the specified name does not 644 * appear in the pattern. 645 * 646 * @param regexp The compiled regular expression. 647 * @param groupName The capture group name, 648 * platform invariant characters only. 649 * @param nameLength The length of the name, or -1 if the name is 650 * nul-terminated. 651 * @param status A pointer to a UErrorCode to receive any errors. 652 * 653 * @stable ICU 55 654 */ 655 U_CAPI int32_t U_EXPORT2 656 uregex_groupNumberFromCName(URegularExpression *regexp, 657 const char *groupName, 658 int32_t nameLength, 659 UErrorCode *status); 660 661 /** Extract the string for the specified matching expression or subexpression. 662 * Group #0 is the complete string of matched text. 663 * Group #1 is the text matched by the first set of capturing parentheses. 664 * 665 * @param regexp The compiled regular expression. 666 * @param groupNum The capture group to extract. Group 0 is the complete 667 * match. The value of this parameter must be 668 * less than or equal to the number of capture groups in 669 * the pattern. 670 * @param dest Buffer to receive the matching string data 671 * @param destCapacity Capacity of the dest buffer. 672 * @param status A reference to a UErrorCode to receive any errors. 673 * @return Length of matching data, 674 * or -1 if no applicable match. 675 * @stable ICU 3.0 676 */ 677 U_CAPI int32_t U_EXPORT2 678 uregex_group(URegularExpression *regexp, 679 int32_t groupNum, 680 UChar *dest, 681 int32_t destCapacity, 682 UErrorCode *status); 683 684 /** Returns a shallow immutable clone of the entire input string with the current index set 685 * to the beginning of the requested capture group. The capture group length is also 686 * returned via groupLength. 687 * Group #0 is the complete string of matched text. 688 * Group #1 is the text matched by the first set of capturing parentheses. 689 * 690 * @param regexp The compiled regular expression. 
691 * @param groupNum The capture group to extract. Group 0 is the complete 692 * match. The value of this parameter must be 693 * less than or equal to the number of capture groups in 694 * the pattern. 695 * @param dest A mutable UText in which to store the current input. 696 * If NULL, a new UText will be created as an immutable shallow clone 697 * of the entire input string. 698 * @param groupLength The group length of the desired capture group. Output parameter. 699 * @param status A reference to a UErrorCode to receive any errors. 700 * @return The subject text currently associated with this regular expression. 701 * If a pre-allocated UText was provided, it will always be used and returned. 702 703 * 704 * @stable ICU 4.6 705 */ 706 U_CAPI UText * U_EXPORT2 707 uregex_groupUText(URegularExpression *regexp, 708 int32_t groupNum, 709 UText *dest, 710 int64_t *groupLength, 711 UErrorCode *status); 712 713 /** 714 * Returns the index in the input string of the start of the text matched by the 715 * specified capture group during the previous match operation. Return -1 if 716 * the capture group was not part of the last match. 717 * Group #0 refers to the complete range of matched text. 718 * Group #1 refers to the text matched by the first set of capturing parentheses. 719 * 720 * @param regexp The compiled regular expression. 721 * @param groupNum The capture group number 722 * @param status A reference to a UErrorCode to receive any errors. 723 * @return the starting (native) position in the input of the text matched 724 * by the specified group. 725 * @stable ICU 3.0 726 */ 727 U_CAPI int32_t U_EXPORT2 728 uregex_start(URegularExpression *regexp, 729 int32_t groupNum, 730 UErrorCode *status); 731 732 /** 733 * 64bit version of uregex_start. 734 * Returns the index in the input string of the start of the text matched by the 735 * specified capture group during the previous match operation. Return -1 if 736 * the capture group was not part of the last match. 
737 * Group #0 refers to the complete range of matched text. 738 * Group #1 refers to the text matched by the first set of capturing parentheses. 739 * 740 * @param regexp The compiled regular expression. 741 * @param groupNum The capture group number 742 * @param status A reference to a UErrorCode to receive any errors. 743 * @return the starting (native) position in the input of the text matched 744 * by the specified group. 745 * @stable ICU 4.6 746 */ 747 U_CAPI int64_t U_EXPORT2 748 uregex_start64(URegularExpression *regexp, 749 int32_t groupNum, 750 UErrorCode *status); 751 752 /** 753 * Returns the index in the input string of the position following the end 754 * of the text matched by the specified capture group. 755 * Return -1 if the capture group was not part of the last match. 756 * Group #0 refers to the complete range of matched text. 757 * Group #1 refers to the text matched by the first set of capturing parentheses. 758 * 759 * @param regexp The compiled regular expression. 760 * @param groupNum The capture group number 761 * @param status A reference to a UErrorCode to receive any errors. 762 * @return the (native) index of the position following the last matched character. 763 * @stable ICU 3.0 764 */ 765 U_CAPI int32_t U_EXPORT2 766 uregex_end(URegularExpression *regexp, 767 int32_t groupNum, 768 UErrorCode *status); 769 770 /** 771 * 64bit version of uregex_end. 772 * Returns the index in the input string of the position following the end 773 * of the text matched by the specified capture group. 774 * Return -1 if the capture group was not part of the last match. 775 * Group #0 refers to the complete range of matched text. 776 * Group #1 refers to the text matched by the first set of capturing parentheses. 777 * 778 * @param regexp The compiled regular expression. 779 * @param groupNum The capture group number 780 * @param status A reference to a UErrorCode to receive any errors. 
781 * @return the (native) index of the position following the last matched character. 782 * @stable ICU 4.6 783 */ 784 U_CAPI int64_t U_EXPORT2 785 uregex_end64(URegularExpression *regexp, 786 int32_t groupNum, 787 UErrorCode *status); 788 789 /** 790 * Reset any saved state from the previous match. Has the effect of 791 * causing uregex_findNext to begin at the specified index, and causing 792 * uregex_start(), uregex_end() and uregex_group() to return an error 793 * indicating that there is no match information available. Clears any 794 * match region that may have been set. 795 * 796 * @param regexp The compiled regular expression. 797 * @param index The position (native) in the text at which a 798 * uregex_findNext() should begin searching. 799 * @param status A reference to a UErrorCode to receive any errors. 800 * @stable ICU 3.0 801 */ 802 U_CAPI void U_EXPORT2 803 uregex_reset(URegularExpression *regexp, 804 int32_t index, 805 UErrorCode *status); 806 807 /** 808 * 64bit version of uregex_reset. 809 * Reset any saved state from the previous match. Has the effect of 810 * causing uregex_findNext to begin at the specified index, and causing 811 * uregex_start(), uregex_end() and uregex_group() to return an error 812 * indicating that there is no match information available. Clears any 813 * match region that may have been set. 814 * 815 * @param regexp The compiled regular expression. 816 * @param index The position (native) in the text at which a 817 * uregex_findNext() should begin searching. 818 * @param status A reference to a UErrorCode to receive any errors. 819 * @stable ICU 4.6 820 */ 821 U_CAPI void U_EXPORT2 822 uregex_reset64(URegularExpression *regexp, 823 int64_t index, 824 UErrorCode *status); 825 826 /** 827 * Sets the limits of the matching region for this URegularExpression. 828 * The region is the part of the input string that will be considered when matching. 
829 * Invoking this method resets any saved state from the previous match, 830 * then sets the region to start at the index specified by the start parameter 831 * and end at the index specified by the end parameter. 832 * 833 * Depending on the transparency and anchoring being used (see useTransparentBounds 834 * and useAnchoringBounds), certain constructs such as anchors may behave differently 835 * at or around the boundaries of the region 836 * 837 * The function will fail if start is greater than limit, or if either index 838 * is less than zero or greater than the length of the string being matched. 839 * 840 * @param regexp The compiled regular expression. 841 * @param regionStart The (native) index to begin searches at. 842 * @param regionLimit The (native) index to end searches at (exclusive). 843 * @param status A pointer to a UErrorCode to receive any errors. 844 * @stable ICU 4.0 845 */ 846 U_CAPI void U_EXPORT2 847 uregex_setRegion(URegularExpression *regexp, 848 int32_t regionStart, 849 int32_t regionLimit, 850 UErrorCode *status); 851 852 /** 853 * 64bit version of uregex_setRegion. 854 * Sets the limits of the matching region for this URegularExpression. 855 * The region is the part of the input string that will be considered when matching. 856 * Invoking this method resets any saved state from the previous match, 857 * then sets the region to start at the index specified by the start parameter 858 * and end at the index specified by the end parameter. 859 * 860 * Depending on the transparency and anchoring being used (see useTransparentBounds 861 * and useAnchoringBounds), certain constructs such as anchors may behave differently 862 * at or around the boundaries of the region 863 * 864 * The function will fail if start is greater than limit, or if either index 865 * is less than zero or greater than the length of the string being matched. 866 * 867 * @param regexp The compiled regular expression. 
868 * @param regionStart The (native) index to begin searches at. 869 * @param regionLimit The (native) index to end searches at (exclusive). 870 * @param status A pointer to a UErrorCode to receive any errors. 871 * @stable ICU 4.6 872 */ 873 U_CAPI void U_EXPORT2 874 uregex_setRegion64(URegularExpression *regexp, 875 int64_t regionStart, 876 int64_t regionLimit, 877 UErrorCode *status); 878 879 /** 880 * Set the matching region and the starting index for subsequent matches 881 * in a single operation. 882 * This is useful because the usual function for setting the starting 883 * index, uregex_reset(), also resets any region limits. 884 * 885 * @param regexp The compiled regular expression. 886 * @param regionStart The (native) index to begin searches at. 887 * @param regionLimit The (native) index to end searches at (exclusive). 888 * @param startIndex The index in the input text at which the next 889 * match operation should begin. 890 * @param status A pointer to a UErrorCode to receive any errors. 891 * @stable ICU 4.6 892 */ 893 U_CAPI void U_EXPORT2 894 uregex_setRegionAndStart(URegularExpression *regexp, 895 int64_t regionStart, 896 int64_t regionLimit, 897 int64_t startIndex, 898 UErrorCode *status); 899 900 /** 901 * Reports the start index of the matching region. Any matches found are limited 902 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive). 903 * 904 * @param regexp The compiled regular expression. 905 * @param status A pointer to a UErrorCode to receive any errors. 906 * @return The starting (native) index of this matcher's region. 907 * @stable ICU 4.0 908 */ 909 U_CAPI int32_t U_EXPORT2 910 uregex_regionStart(const URegularExpression *regexp, 911 UErrorCode *status); 912 913 /** 914 * 64bit version of uregex_regionStart. 915 * Reports the start index of the matching region. Any matches found are limited 916 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
917 * 918 * @param regexp The compiled regular expression. 919 * @param status A pointer to a UErrorCode to receive any errors. 920 * @return The starting (native) index of this matcher's region. 921 * @stable ICU 4.6 922 */ 923 U_CAPI int64_t U_EXPORT2 924 uregex_regionStart64(const URegularExpression *regexp, 925 UErrorCode *status); 926 927 /** 928 * Reports the end index (exclusive) of the matching region for this URegularExpression. 929 * Any matches found are limited to the region bounded by regionStart (inclusive) 930 * and regionEnd (exclusive). 931 * 932 * @param regexp The compiled regular expression. 933 * @param status A pointer to a UErrorCode to receive any errors. 934 * @return The ending point (native) of this matcher's region. 935 * @stable ICU 4.0 936 */ 937 U_CAPI int32_t U_EXPORT2 938 uregex_regionEnd(const URegularExpression *regexp, 939 UErrorCode *status); 940 941 /** 942 * 64bit version of uregex_regionEnd. 943 * Reports the end index (exclusive) of the matching region for this URegularExpression. 944 * Any matches found are limited to the region bounded by regionStart (inclusive) 945 * and regionEnd (exclusive). 946 * 947 * @param regexp The compiled regular expression. 948 * @param status A pointer to a UErrorCode to receive any errors. 949 * @return The ending point (native) of this matcher's region. 950 * @stable ICU 4.6 951 */ 952 U_CAPI int64_t U_EXPORT2 953 uregex_regionEnd64(const URegularExpression *regexp, 954 UErrorCode *status); 955 956 /** 957 * Queries the transparency of region bounds for this URegularExpression. 958 * See useTransparentBounds for a description of transparent and opaque bounds. 959 * By default, matching boundaries are opaque. 960 * 961 * @param regexp The compiled regular expression. 962 * @param status A pointer to a UErrorCode to receive any errors. 963 * @return true if this matcher is using transparent bounds, false if it is not.
964 * @stable ICU 4.0 965 */ 966 U_CAPI UBool U_EXPORT2 967 uregex_hasTransparentBounds(const URegularExpression *regexp, 968 UErrorCode *status); 969 970 971 /** 972 * Sets the transparency of region bounds for this URegularExpression. 973 * Invoking this function with an argument of true will set matches to use transparent bounds. 974 * If the boolean argument is false, then opaque bounds will be used. 975 * 976 * Using transparent bounds, the boundaries of the matching region are transparent 977 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 978 * see text beyond the boundaries of the region while checking for a match. 979 * 980 * With opaque bounds, no text outside of the matching region is visible to lookahead, 981 * lookbehind, and boundary matching constructs. 982 * 983 * By default, opaque bounds are used. 984 * 985 * @param regexp The compiled regular expression. 986 * @param b true for transparent bounds; false for opaque bounds 987 * @param status A pointer to a UErrorCode to receive any errors. 988 * @stable ICU 4.0 989 **/ 990 U_CAPI void U_EXPORT2 991 uregex_useTransparentBounds(URegularExpression *regexp, 992 UBool b, 993 UErrorCode *status); 994 995 996 /** 997 * Return true if this URegularExpression is using anchoring bounds. 998 * By default, anchoring region bounds are used. 999 * 1000 * @param regexp The compiled regular expression. 1001 * @param status A pointer to a UErrorCode to receive any errors. 1002 * @return true if this matcher is using anchoring bounds. 1003 * @stable ICU 4.0 1004 */ 1005 U_CAPI UBool U_EXPORT2 1006 uregex_hasAnchoringBounds(const URegularExpression *regexp, 1007 UErrorCode *status); 1008 1009 1010 /** 1011 * Set whether this URegularExpression is using Anchoring Bounds for its region. 1012 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 1013 * and end of the region. 
Without Anchoring Bounds, anchors will only match at 1014 * the positions they would in the complete text. 1015 * 1016 * Anchoring Bounds are the default for regions. 1017 * 1018 * @param regexp The compiled regular expression. 1019 * @param b true if to enable anchoring bounds; false to disable them. 1020 * @param status A pointer to a UErrorCode to receive any errors. 1021 * @stable ICU 4.0 1022 */ 1023 U_CAPI void U_EXPORT2 1024 uregex_useAnchoringBounds(URegularExpression *regexp, 1025 UBool b, 1026 UErrorCode *status); 1027 1028 /** 1029 * Return true if the most recent matching operation touched the 1030 * end of the text being processed. In this case, additional input text could 1031 * change the results of that match. 1032 * 1033 * @param regexp The compiled regular expression. 1034 * @param status A pointer to a UErrorCode to receive any errors. 1035 * @return true if the most recent match hit the end of input 1036 * @stable ICU 4.0 1037 */ 1038 U_CAPI UBool U_EXPORT2 1039 uregex_hitEnd(const URegularExpression *regexp, 1040 UErrorCode *status); 1041 1042 /** 1043 * Return true if the most recent match succeeded and additional input could cause 1044 * it to fail. If this function returns false and a match was found, then more input 1045 * might change the match but the match won't be lost. If a match was not found, 1046 * then requireEnd has no meaning. 1047 * 1048 * @param regexp The compiled regular expression. 1049 * @param status A pointer to a UErrorCode to receive any errors. 1050 * @return true if more input could cause the most recent match to no longer match. 1051 * @stable ICU 4.0 1052 */ 1053 U_CAPI UBool U_EXPORT2 1054 uregex_requireEnd(const URegularExpression *regexp, 1055 UErrorCode *status); 1056 1057 1058 1059 1060 1061 /** 1062 * Replaces every substring of the input that matches the pattern 1063 * with the given replacement string. This is a convenience function that 1064 * provides a complete find-and-replace-all operation.
1065 * 1066 * This method scans the input string looking for matches of the pattern. 1067 * Input that is not part of any match is copied unchanged to the 1068 * destination buffer. Matched regions are replaced in the output 1069 * buffer by the replacement string. The replacement string may contain 1070 * references to capture groups; these take the form of $1, $2, etc. 1071 * 1072 * @param regexp The compiled regular expression. 1073 * @param replacementText A string containing the replacement text. 1074 * @param replacementLength The length of the replacement string, or 1075 * -1 if it is NUL terminated. 1076 * @param destBuf A (UChar *) buffer that will receive the result. 1077 * @param destCapacity The capacity of the destination buffer. 1078 * @param status A reference to a UErrorCode to receive any errors. 1079 * @return The length of the string resulting from the find 1080 * and replace operation. In the event that the 1081 * destination capacity is inadequate, the return value 1082 * is still the full length of the untruncated string. 1083 * @stable ICU 3.0 1084 */ 1085 U_CAPI int32_t U_EXPORT2 1086 uregex_replaceAll(URegularExpression *regexp, 1087 const UChar *replacementText, 1088 int32_t replacementLength, 1089 UChar *destBuf, 1090 int32_t destCapacity, 1091 UErrorCode *status); 1092 1093 /** 1094 * Replaces every substring of the input that matches the pattern 1095 * with the given replacement string. This is a convenience function that 1096 * provides a complete find-and-replace-all operation. 1097 * 1098 * This method scans the input string looking for matches of the pattern. 1099 * Input that is not part of any match is copied unchanged to the 1100 * destination buffer. Matched regions are replaced in the output 1101 * buffer by the replacement string. The replacement string may contain 1102 * references to capture groups; these take the form of $1, $2, etc. 1103 * 1104 * @param regexp The compiled regular expression. 
1105 * @param replacement A string containing the replacement text. 1106 * @param dest A mutable UText that will receive the result. 1107 * If NULL, a new UText will be created (which may not be mutable). 1108 * @param status A reference to a UErrorCode to receive any errors. 1109 * @return A UText containing the results of the find and replace. 1110 * If a pre-allocated UText was provided, it will always be used and returned. 1111 * 1112 * @stable ICU 4.6 1113 */ 1114 U_CAPI UText * U_EXPORT2 1115 uregex_replaceAllUText(URegularExpression *regexp, 1116 UText *replacement, 1117 UText *dest, 1118 UErrorCode *status); 1119 1120 /** 1121 * Replaces the first substring of the input that matches the pattern 1122 * with the given replacement string. This is a convenience function that 1123 * provides a complete find-and-replace operation. 1124 * 1125 * This method scans the input string looking for a match of the pattern. 1126 * All input that is not part of the match is copied unchanged to the 1127 * destination buffer. The matched region is replaced in the output 1128 * buffer by the replacement string. The replacement string may contain 1129 * references to capture groups; these take the form of $1, $2, etc. 1130 * 1131 * @param regexp The compiled regular expression. 1132 * @param replacementText A string containing the replacement text. 1133 * @param replacementLength The length of the replacement string, or 1134 * -1 if it is NUL terminated. 1135 * @param destBuf A (UChar *) buffer that will receive the result. 1136 * @param destCapacity The capacity of the destination buffer. 1137 * @param status a reference to a UErrorCode to receive any errors. 1138 * @return The length of the string resulting from the find 1139 * and replace operation. In the event that the 1140 * destination capacity is inadequate, the return value 1141 * is still the full length of the untruncated string. 
1142 * @stable ICU 3.0 1143 */ 1144 U_CAPI int32_t U_EXPORT2 1145 uregex_replaceFirst(URegularExpression *regexp, 1146 const UChar *replacementText, 1147 int32_t replacementLength, 1148 UChar *destBuf, 1149 int32_t destCapacity, 1150 UErrorCode *status); 1151 1152 /** 1153 * Replaces the first substring of the input that matches the pattern 1154 * with the given replacement string. This is a convenience function that 1155 * provides a complete find-and-replace operation. 1156 * 1157 * This method scans the input string looking for a match of the pattern. 1158 * All input that is not part of the match is copied unchanged to the 1159 * destination buffer. The matched region is replaced in the output 1160 * buffer by the replacement string. The replacement string may contain 1161 * references to capture groups; these take the form of $1, $2, etc. 1162 * 1163 * @param regexp The compiled regular expression. 1164 * @param replacement A string containing the replacement text. 1165 * @param dest A mutable UText that will receive the result. 1166 * If NULL, a new UText will be created (which may not be mutable). 1167 * @param status A reference to a UErrorCode to receive any errors. 1168 * @return A UText containing the results of the find and replace. 1169 * If a pre-allocated UText was provided, it will always be used and returned. 1170 * 1171 * @stable ICU 4.6 1172 */ 1173 U_CAPI UText * U_EXPORT2 1174 uregex_replaceFirstUText(URegularExpression *regexp, 1175 UText *replacement, 1176 UText *dest, 1177 UErrorCode *status); 1178 1179 /** 1180 * Implements a replace operation intended to be used as part of an 1181 * incremental find-and-replace. 1182 * 1183 * <p>The input string, starting from the end of the previous match and ending at 1184 * the start of the current match, is appended to the destination string.
Then the 1185 * replacement string is appended to the output string, 1186 * including handling any substitutions of captured text.</p> 1187 * 1188 * <p>A note on preflight computation of buffersize and error handling: 1189 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1190 * designed to be chained, one after another, with the destination 1191 * buffer pointer and buffer capacity updated after each in preparation 1192 * for the next. If the destination buffer is exhausted partway through such a 1193 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1194 * ICU conventions are for a function to perform no action if it is 1195 * called with an error status, but for this one case, uregex_appendReplacement() 1196 * will operate normally so that buffer size computations will complete 1197 * correctly.</p> 1198 * 1199 * <p>For simple, prepackaged, non-incremental find-and-replace 1200 * operations, see replaceFirst() or replaceAll().</p> 1201 * 1202 * @param regexp The regular expression object. 1203 * @param replacementText The string that will replace the matched portion of the 1204 * input string as it is copied to the destination buffer. 1205 * The replacement text may contain references ($1, for 1206 * example) to capture groups from the match. 1207 * @param replacementLength The length of the replacement text string, 1208 * or -1 if the string is NUL terminated. 1209 * @param destBuf The buffer into which the results of the 1210 * find-and-replace are placed. On return, this pointer 1211 * will be updated to refer to the beginning of the 1212 * unused portion of buffer, leaving it in position for 1213 * a subsequent call to this function. 1214 * @param destCapacity The size of the output buffer. On return, this 1215 * parameter will be updated to reflect the space remaining 1216 * unused in the output buffer. 1217 * @param status A reference to a UErrorCode to receive any errors. 1218 * @return The length of the result string.
In the event that 1219 * destCapacity is inadequate, the full length of the 1220 * untruncated output string is returned. 1221 * 1222 * @stable ICU 3.0 1223 * 1224 */ 1225 U_CAPI int32_t U_EXPORT2 1226 uregex_appendReplacement(URegularExpression *regexp, 1227 const UChar *replacementText, 1228 int32_t replacementLength, 1229 UChar **destBuf, 1230 int32_t *destCapacity, 1231 UErrorCode *status); 1232 1233 /** 1234 * Implements a replace operation intended to be used as part of an 1235 * incremental find-and-replace. 1236 * 1237 * <p>The input string, starting from the end of the previous match and ending at 1238 * the start of the current match, is appended to the destination string. Then the 1239 * replacement string is appended to the output string, 1240 * including handling any substitutions of captured text.</p> 1241 * 1242 * <p>For simple, prepackaged, non-incremental find-and-replace 1243 * operations, see replaceFirst() or replaceAll().</p> 1244 * 1245 * @param regexp The regular expression object. 1246 * @param replacementText The string that will replace the matched portion of the 1247 * input string as it is copied to the destination buffer. 1248 * The replacement text may contain references ($1, for 1249 * example) to capture groups from the match. 1250 * @param dest A mutable UText that will receive the result. Must not be NULL. 1251 * @param status A reference to a UErrorCode to receive any errors. 1252 * 1253 * @stable ICU 4.6 1254 */ 1255 U_CAPI void U_EXPORT2 1256 uregex_appendReplacementUText(URegularExpression *regexp, 1257 UText *replacementText, 1258 UText *dest, 1259 UErrorCode *status); 1260 1261 /** 1262 * As the final step in a find-and-replace operation, append the remainder 1263 * of the input string, starting at the position following the last match, 1264 * to the destination string. 
<code>uregex_appendTail()</code> is intended 1265 * to be invoked after one or more invocations of the 1266 * <code>uregex_appendReplacement()</code> function. 1267 * 1268 * @param regexp The regular expression object. This is needed to 1269 * obtain the input string and with the position 1270 * of the last match within it. 1271 * @param destBuf The buffer in which the results of the 1272 * find-and-replace are placed. On return, the pointer 1273 * will be updated to refer to the beginning of the 1274 * unused portion of buffer. 1275 * @param destCapacity The size of the output buffer, On return, this 1276 * value will be updated to reflect the space remaining 1277 * unused in the output buffer. 1278 * @param status A reference to a UErrorCode to receive any errors. 1279 * @return The length of the result string. In the event that 1280 * destCapacity is inadequate, the full length of the 1281 * untruncated output string is returned. 1282 * 1283 * @stable ICU 3.0 1284 */ 1285 U_CAPI int32_t U_EXPORT2 1286 uregex_appendTail(URegularExpression *regexp, 1287 UChar **destBuf, 1288 int32_t *destCapacity, 1289 UErrorCode *status); 1290 1291 /** 1292 * As the final step in a find-and-replace operation, append the remainder 1293 * of the input string, starting at the position following the last match, 1294 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1295 * to be invoked after one or more invocations of the 1296 * <code>uregex_appendReplacementUText()</code> function. 1297 * 1298 * @param regexp The regular expression object. This is needed to 1299 * obtain the input string and with the position 1300 * of the last match within it. 1301 * @param dest A mutable UText that will receive the result. Must not be NULL. 1302 * 1303 * @param status Error code 1304 * 1305 * @return The destination UText. 
1306 * 1307 * @stable ICU 4.6 1308 */ 1309 U_CAPI UText * U_EXPORT2 1310 uregex_appendTailUText(URegularExpression *regexp, 1311 UText *dest, 1312 UErrorCode *status); 1313 1314 /** 1315 * Split a string into fields. Somewhat like split() from Perl. 1316 * The pattern matches identify delimiters that separate the input 1317 * into fields. The input data between the matches becomes the 1318 * fields themselves. 1319 * 1320 * Each of the fields is copied from the input string to the destination 1321 * buffer, and NUL terminated. The position of each field within 1322 * the destination buffer is returned in the destFields array. 1323 * 1324 * If the delimiter pattern includes capture groups, the captured text will 1325 * also appear in the destination array of output strings, interspersed 1326 * with the fields. This is similar to Perl, but differs from Java, 1327 * which ignores the presence of capture groups in the pattern. 1328 * 1329 * Trailing empty fields will always be returned, assuming sufficient 1330 * destination capacity. This differs from the default behavior for Java 1331 * and Perl where trailing empty fields are not returned. 1332 * 1333 * The number of strings produced by the split operation is returned. 1334 * This count includes the strings from capture groups in the delimiter pattern. 1335 * This behavior differs from Java, which ignores capture groups. 1336 * 1337 * @param regexp The compiled regular expression. 1338 * @param destBuf A (UChar *) buffer to receive the fields that 1339 * are extracted from the input string. These 1340 * field pointers will refer to positions within the 1341 * destination buffer supplied by the caller. Any 1342 * extra positions within the destFields array will be 1343 * set to NULL. 1344 * @param destCapacity The capacity of the destBuf. 1345 * @param requiredCapacity The actual capacity required of the destBuf.
1346 * If destCapacity is too small, requiredCapacity will return 1347 * the total capacity required to hold all of the output, and 1348 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1349 * @param destFields An array to be filled with the position of each 1350 * of the extracted fields within destBuf. 1351 * @param destFieldsCapacity The number of elements in the destFields array. 1352 * If the number of fields found is less than destFieldsCapacity, 1353 * the extra destFields elements are set to zero. 1354 * If destFieldsCapacity is too small, the trailing part of the 1355 * input, including any field delimiters, is treated as if it 1356 * were the last field - it is copied to the destBuf, and 1357 * its position in the destBuf is stored in the last element 1358 * of destFields. This behavior mimics that of Perl. It is not 1359 * an error condition, and no error status is returned when all destField 1360 * positions are used. 1361 * @param status A reference to a UErrorCode to receive any errors. 1362 * @return The number of fields into which the input string was split. 1363 * @stable ICU 3.0 1364 */ 1365 U_CAPI int32_t U_EXPORT2 1366 uregex_split( URegularExpression *regexp, 1367 UChar *destBuf, 1368 int32_t destCapacity, 1369 int32_t *requiredCapacity, 1370 UChar *destFields[], 1371 int32_t destFieldsCapacity, 1372 UErrorCode *status); 1373 1374 /** 1375 * Split a string into fields. Somewhat like split() from Perl. 1376 * The pattern matches identify delimiters that separate the input 1377 * into fields. The input data between the matches becomes the 1378 * fields themselves. 1379 * <p> 1380 * The behavior of this function is not very closely aligned with uregex_split(); 1381 * instead, it is based on (and implemented directly on top of) the C++ split method. 1382 * 1383 * @param regexp The compiled regular expression. 1384 * @param destFields An array of mutable UText structs to receive the results of the split.
1385 * If a field is NULL, a new UText is allocated to contain the results for 1386 * that field. This new UText is not guaranteed to be mutable. 1387 * @param destFieldsCapacity The number of elements in the destination array. 1388 * If the number of fields found is less than destFieldsCapacity, the 1389 * extra strings in the destination array are not altered. 1390 * If the number of destination strings is less than the number 1391 * of fields, the trailing part of the input string, including any 1392 * field delimiters, is placed in the last destination string. 1393 * This behavior mimics that of Perl. It is not an error condition, and no 1394 * error status is returned when all destField positions are used. 1395 * @param status A reference to a UErrorCode to receive any errors. 1396 * @return The number of fields into which the input string was split. 1397 * 1398 * @stable ICU 4.6 1399 */ 1400 U_CAPI int32_t U_EXPORT2 1401 uregex_splitUText(URegularExpression *regexp, 1402 UText *destFields[], 1403 int32_t destFieldsCapacity, 1404 UErrorCode *status); 1405 1406 /** 1407 * Set a processing time limit for match operations with this URegularExpression. 1408 * 1409 * Some patterns, when matching certain strings, can run in exponential time. 1410 * For practical purposes, the match operation may appear to be in an 1411 * infinite loop. 1412 * When a limit is set a match operation will fail with an error if the 1413 * limit is exceeded. 1414 * <p> 1415 * The units of the limit are steps of the match engine. 1416 * Correspondence with actual processor time will depend on the speed 1417 * of the processor and the details of the specific pattern, but will 1418 * typically be on the order of milliseconds. 1419 * <p> 1420 * By default, the matching time is not limited. 1421 * <p> 1422 * 1423 * @param regexp The compiled regular expression. 1424 * @param limit The limit value, or 0 for no limit. 1425 * @param status A reference to a UErrorCode to receive any errors.
1426 * @stable ICU 4.0 1427 */ 1428 U_CAPI void U_EXPORT2 1429 uregex_setTimeLimit(URegularExpression *regexp, 1430 int32_t limit, 1431 UErrorCode *status); 1432 1433 /** 1434 * Get the time limit for matches with this URegularExpression. 1435 * A return value of zero indicates that there is no limit. 1436 * 1437 * @param regexp The compiled regular expression. 1438 * @param status A reference to a UErrorCode to receive any errors. 1439 * @return the maximum allowed time for a match, in units of processing steps. 1440 * @stable ICU 4.0 1441 */ 1442 U_CAPI int32_t U_EXPORT2 1443 uregex_getTimeLimit(const URegularExpression *regexp, 1444 UErrorCode *status); 1445 1446 /** 1447 * Set the amount of heap storage available for use by the match backtracking stack. 1448 * <p> 1449 * ICU uses a backtracking regular expression engine, with the backtrack stack 1450 * maintained on the heap. This function sets the limit to the amount of memory 1451 * that can be used for this purpose. A backtracking stack overflow will 1452 * result in an error from the match operation that caused it. 1453 * <p> 1454 * A limit is desirable because a malicious or poorly designed pattern can use 1455 * excessive memory, potentially crashing the process. A limit is enabled 1456 * by default. 1457 * <p> 1458 * @param regexp The compiled regular expression. 1459 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1460 * A value of zero means no limit. 1461 * The limit must be greater than or equal to zero. 1462 * @param status A reference to a UErrorCode to receive any errors. 1463 * 1464 * @stable ICU 4.0 1465 */ 1466 U_CAPI void U_EXPORT2 1467 uregex_setStackLimit(URegularExpression *regexp, 1468 int32_t limit, 1469 UErrorCode *status); 1470 1471 /** 1472 * Get the size of the heap storage available for use by the backtracking stack. 1473 * 1474 * @return the maximum backtracking stack size, in bytes, or zero if the 1475 * stack size is unlimited.
1476 * @stable ICU 4.0 1477 */ 1478 U_CAPI int32_t U_EXPORT2 1479 uregex_getStackLimit(const URegularExpression *regexp, 1480 UErrorCode *status); 1481 1482 1483 /** 1484 * Function pointer for a regular expression matching callback function. 1485 * When set, a callback function will be called periodically during matching 1486 * operations. If the callback function returns false, the matching 1487 * operation will be terminated early. 1488 * 1489 * Note: the callback function must not call other functions on this 1490 * URegularExpression. 1491 * 1492 * @param context Context pointer. The callback function will be invoked 1493 * with the context specified at the time that 1494 * uregex_setMatchCallback() is called. 1495 * @param steps The accumulated processing time, in match steps, 1496 * for this matching operation. 1497 * @return true to continue the matching operation. 1498 * false to terminate the matching operation. 1499 * @stable ICU 4.0 1500 */ 1501 U_CDECL_BEGIN 1502 typedef UBool U_CALLCONV URegexMatchCallback ( 1503 const void *context, 1504 int32_t steps); 1505 U_CDECL_END 1506 1507 /** 1508 * Set a callback function for this URegularExpression. 1509 * During matching operations the function will be called periodically, 1510 * giving the application the opportunity to terminate a long-running 1511 * match. 1512 * 1513 * @param regexp The compiled regular expression. 1514 * @param callback A pointer to the user-supplied callback function (see URegexMatchCallback). 1515 * @param context User context pointer. The value supplied at the 1516 * time the callback function is set will be saved 1517 * and passed to the callback each time that it is called. 1518 * @param status A reference to a UErrorCode to receive any errors.
1519 * @stable ICU 4.0 1520 */ 1521 U_CAPI void U_EXPORT2 1522 uregex_setMatchCallback(URegularExpression *regexp, 1523 URegexMatchCallback *callback, 1524 const void *context, 1525 UErrorCode *status); 1526 1527 1528 /** 1529 * Get the match callback function for this URegularExpression. 1530 * 1531 * @param regexp The compiled regular expression. 1532 * @param callback Out parameter, receives a pointer to the user-supplied 1533 * callback function (see URegexMatchCallback). 1534 * @param context Out parameter, receives the user context pointer that 1535 * was set when uregex_setMatchCallback() was called. 1536 * @param status A reference to a UErrorCode to receive any errors. 1537 * @stable ICU 4.0 1538 */ 1539 U_CAPI void U_EXPORT2 1540 uregex_getMatchCallback(const URegularExpression *regexp, 1541 URegexMatchCallback **callback, 1542 const void **context, 1543 UErrorCode *status); 1544 1545 /** 1546 * Function pointer for a regular expression find callback function. 1547 * 1548 * When set, a callback function will be called during a find operation 1549 * and for operations that depend on find, such as findNext, split and some replace 1550 * operations like replaceFirst. 1551 * The callback will usually be called after each attempt at a match, but this is not a 1552 * guarantee that the callback will be invoked at each character. For finds where the 1553 * match engine is invoked at each character, this may be close to true, but less likely 1554 * for more optimized loops where a match is known to start, and so the match 1555 * engine is invoked, only at certain characters. 1556 * When invoked, this callback will specify the index at which a match operation is about 1557 * to be attempted, giving the application the opportunity to terminate a long-running 1558 * find operation. 1559 * 1560 * If the callback function returns false, the find operation will be terminated early.
1561 * 1562 * Note: the callback function must not call other functions on this 1563 * URegularExpression. 1564 * 1565 * @param context Context pointer. The callback function will be invoked 1566 * with the context specified at the time that 1567 * uregex_setFindProgressCallback() is called. 1568 * @param matchIndex The next index at which a match will be attempted for this 1569 * find operation. If this callback interrupts the search, this is the 1570 * index at which a find/findNext operation may be re-initiated. 1571 * @return true to continue the find operation. 1572 * false to terminate the find operation. 1573 * @stable ICU 4.6 1574 */ 1575 U_CDECL_BEGIN 1576 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1577 const void *context, 1578 int64_t matchIndex); 1579 U_CDECL_END 1580 1581 1582 /** 1583 * Set the find progress callback function for this URegularExpression. 1584 * 1585 * @param regexp The compiled regular expression. 1586 * @param callback A pointer to the user-supplied callback function (see URegexFindProgressCallback). 1587 * @param context User context pointer. The value supplied at the 1588 * time the callback function is set will be saved 1589 * and passed to the callback each time that it is called. 1590 * @param status A reference to a UErrorCode to receive any errors.
1591 * @stable ICU 4.6 1592 */ 1593 U_CAPI void U_EXPORT2 1594 uregex_setFindProgressCallback(URegularExpression *regexp, 1595 URegexFindProgressCallback *callback, 1596 const void *context, 1597 UErrorCode *status); 1598 1599 /** 1600 * Get the find progress callback function for this URegularExpression. 1601 * 1602 * @param regexp The compiled regular expression. 1603 * @param callback Out parameter, receives a pointer to the user-supplied 1604 * callback function (see URegexFindProgressCallback). 1605 * @param context Out parameter, receives the user context pointer that 1606 * was set when uregex_setFindProgressCallback() was called. 1607 * @param status A reference to a UErrorCode to receive any errors. 1608 * @stable ICU 4.6 1609 */ 1610 U_CAPI void U_EXPORT2 1611 uregex_getFindProgressCallback(const URegularExpression *regexp, 1612 URegexFindProgressCallback **callback, 1613 const void **context, 1614 UErrorCode *status); 1615 1616 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1617 #endif /* UREGEX_H */ 1618