1 /* 2 ********************************************************************** 3 * Copyright (C) 2004-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: uregex.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2004mar09 11 * created by: Andy Heninger 12 * 13 * ICU Regular Expressions, API for C 14 */ 15 16 /** 17 * \file 18 * \brief C API: Regular Expressions 19 * 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 21 */ 22 23 #ifndef UREGEX_H 24 #define UREGEX_H 25 26 #include "unicode/utext.h" 27 #include "unicode/utypes.h" 28 29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 30 31 #include "unicode/localpointer.h" 32 #include "unicode/parseerr.h" 33 34 struct URegularExpression; 35 /** 36 * Structure representing a compiled regular expression, plus the results 37 * of a match operation. 38 * @stable ICU 3.0 39 */ 40 typedef struct URegularExpression URegularExpression; 41 42 43 /** 44 * Constants for Regular Expression Match Modes. 45 * @stable ICU 2.4 46 */ 47 typedef enum URegexpFlag{ 48 49 #ifndef U_HIDE_DRAFT_API 50 /** Forces normalization of pattern and strings. 51 Not implemented yet, just a placeholder, hence draft. 52 @draft ICU 2.4 */ 53 UREGEX_CANON_EQ = 128, 54 #endif 55 /** Enable case insensitive matching. @stable ICU 2.4 */ 56 UREGEX_CASE_INSENSITIVE = 2, 57 58 /** Allow white space and comments within patterns @stable ICU 2.4 */ 59 UREGEX_COMMENTS = 4, 60 61 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 62 * @stable ICU 2.4 */ 63 UREGEX_DOTALL = 32, 64 65 /** If set, treat the entire pattern as a literal string. 66 * Metacharacters or escape sequences in the input sequence will be given 67 * no special meaning. Not implemented yet as of ICU 4.4. 
68 * 69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact 70 * on matching when used in conjunction with this flag. 71 * The other flags become superfluous. 72 * TODO: say which escapes are still handled; anything Java does 73 * early (\\u) we should still do. 74 * @stable ICU 4.0 75 */ 76 UREGEX_LITERAL = 16, 77 78 /** Control behavior of "$" and "^" 79 * If set, recognize line terminators within string, 80 * otherwise, match only at start and end of input string. 81 * @stable ICU 2.4 */ 82 UREGEX_MULTILINE = 8, 83 84 /** Unix-only line endings. 85 * When this mode is enabled, only \\u000a is recognized as a line ending 86 * in the behavior of ., ^, and $. 87 * @stable ICU 4.0 88 */ 89 UREGEX_UNIX_LINES = 1, 90 91 /** Unicode word boundaries. 92 * If set, \b uses the Unicode TR 29 definition of word boundaries. 93 * Warning: Unicode word boundaries are quite different from 94 * traditional regular expression word boundaries. See 95 * http://unicode.org/reports/tr29/#Word_Boundaries 96 * @stable ICU 2.8 97 */ 98 UREGEX_UWORD = 256, 99 100 /** Error on Unrecognized backslash escapes. 101 * If set, fail with an error on patterns that contain 102 * backslash-escaped ASCII letters without a known special 103 * meaning. If this flag is not set, these 104 * escaped letters represent themselves. 105 * @stable ICU 4.0 106 */ 107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 108 109 } URegexpFlag; 110 111 /** 112 * Open (compile) an ICU regular expression. Compiles the regular expression in 113 * string form into an internal representation using the specified match mode flags. 114 * The resulting regular expression handle can then be used to perform various 115 * matching operations. 116 * 117 * 118 * @param pattern The Regular Expression pattern to be compiled. 119 * @param patternLength The length of the pattern, or -1 if the pattern is 120 * NUL terminated. 
121 * @param flags Flags that alter the default matching behavior for 122 * the regular expression, UREGEX_CASE_INSENSITIVE, for 123 * example. For default behavior, set this parameter to zero. 124 * See <code>enum URegexpFlag</code>. All desired flags 125 * are bitwise-ORed together. 126 * @param pe Receives the position (line and column numbers) of any syntax 127 * error within the source regular expression string. If this 128 * information is not wanted, pass NULL for this parameter. 129 * @param status Receives error detected by this function. 130 * @stable ICU 3.0 131 * 132 */ 133 U_STABLE URegularExpression * U_EXPORT2 134 uregex_open( const UChar *pattern, 135 int32_t patternLength, 136 uint32_t flags, 137 UParseError *pe, 138 UErrorCode *status); 139 140 /** 141 * Open (compile) an ICU regular expression. Compiles the regular expression in 142 * string form into an internal representation using the specified match mode flags. 143 * The resulting regular expression handle can then be used to perform various 144 * matching operations. 145 * <p> 146 * The contents of the pattern UText will be extracted and saved. Ownership of the 147 * UText struct itself remains with the caller. This is to match the behavior of 148 * uregex_open(). 149 * 150 * @param pattern The Regular Expression pattern to be compiled. 151 * @param flags Flags that alter the default matching behavior for 152 * the regular expression, UREGEX_CASE_INSENSITIVE, for 153 * example. For default behavior, set this parameter to zero. 154 * See <code>enum URegexpFlag</code>. All desired flags 155 * are bitwise-ORed together. 156 * @param pe Receives the position (line and column numbers) of any syntax 157 * error within the source regular expression string. If this 158 * information is not wanted, pass NULL for this parameter. 159 * @param status Receives error detected by this function. 
160 * 161 * @draft ICU 4.6 162 */ 163 U_DRAFT URegularExpression * U_EXPORT2 164 uregex_openUText(UText *pattern, 165 uint32_t flags, 166 UParseError *pe, 167 UErrorCode *status); 168 169 /** 170 * Open (compile) an ICU regular expression. The resulting regular expression 171 * handle can then be used to perform various matching operations. 172 * <p> 173 * This function is the same as uregex_open, except that the pattern 174 * is supplied as an 8 bit char * string in the default code page. 175 * 176 * @param pattern The Regular Expression pattern to be compiled, 177 * NUL terminated. 178 * @param flags Flags that alter the default matching behavior for 179 * the regular expression, UREGEX_CASE_INSENSITIVE, for 180 * example. For default behavior, set this parameter to zero. 181 * See <code>enum URegexpFlag</code>. All desired flags 182 * are bitwise-ORed together. 183 * @param pe Receives the position (line and column numbers) of any syntax 184 * error within the source regular expression string. If this 185 * information is not wanted, pass NULL for this parameter. 186 * @param status Receives errors detected by this function. 187 * @return The URegularExpression object representing the compiled 188 * pattern. 189 * 190 * @stable ICU 3.0 191 */ 192 #if !UCONFIG_NO_CONVERSION 193 U_STABLE URegularExpression * U_EXPORT2 194 uregex_openC( const char *pattern, 195 uint32_t flags, 196 UParseError *pe, 197 UErrorCode *status); 198 #endif 199 200 201 202 /** 203 * Close the regular expression, recovering all resources (memory) it 204 * was holding. 205 * 206 * @param regexp The regular expression to be closed. 207 * @stable ICU 3.0 208 */ 209 U_STABLE void U_EXPORT2 210 uregex_close(URegularExpression *regexp); 211 212 #if U_SHOW_CPLUSPLUS_API 213 214 U_NAMESPACE_BEGIN 215 216 /** 217 * \class LocalURegularExpressionPointer 218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 219 * For most methods see the LocalPointerBase base class. 
220 * 221 * @see LocalPointerBase 222 * @see LocalPointer 223 * @stable ICU 4.4 224 */ 225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 226 227 U_NAMESPACE_END 228 229 #endif 230 231 /** 232 * Make a copy of a compiled regular expression. Cloning a regular 233 * expression is faster than opening a second instance from the source 234 * form of the expression, and requires less memory. 235 * <p> 236 * Note that the current input string and the position of any matched text 237 * within it are not cloned; only the pattern itself and the 238 * match mode flags are copied. 239 * <p> 240 * Cloning can be particularly useful to threaded applications that perform 241 * multiple match operations in parallel. Each concurrent RE 242 * operation requires its own instance of a URegularExpression. 243 * 244 * @param regexp The compiled regular expression to be cloned. 245 * @param status Receives indication of any errors encountered 246 * @return the cloned copy of the compiled regular expression. 247 * @stable ICU 3.0 248 */ 249 U_STABLE URegularExpression * U_EXPORT2 250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 251 252 /** 253 * Returns a pointer to the source form of the pattern for this regular expression. 254 * This function will work even if the pattern was originally specified as a UText. 255 * 256 * @param regexp The compiled regular expression. 257 * @param patLength This output parameter will be set to the length of the 258 * pattern string. A NULL pointer may be used here if the 259 * pattern length is not needed, as would be the case if 260 * the pattern is known in advance to be a NUL terminated 261 * string. 262 * @param status Receives errors detected by this function. 263 * @return a pointer to the pattern string. The storage for the string is 264 * owned by the regular expression object, and must not be 265 * altered or deleted by the application. 
The returned string 266 * will remain valid until the regular expression is closed. 267 * @stable ICU 3.0 268 */ 269 U_STABLE const UChar * U_EXPORT2 270 uregex_pattern(const URegularExpression *regexp, 271 int32_t *patLength, 272 UErrorCode *status); 273 274 /** 275 * Returns the source text of the pattern for this regular expression. 276 * This function will work even if the pattern was originally specified as a UChar string. 277 * 278 * @param regexp The compiled regular expression. 279 * @param status Receives errors detected by this function. 280 * @return the pattern text. The storage for the text is owned by the regular expression 281 * object, and must not be altered or deleted. 282 * 283 * @draft ICU 4.6 284 */ 285 U_DRAFT UText * U_EXPORT2 286 uregex_patternUText(const URegularExpression *regexp, 287 UErrorCode *status); 288 289 290 /** 291 * Get the match mode flags that were specified when compiling this regular expression. 292 * @param status Receives errors detected by this function. 293 * @param regexp The compiled regular expression. 294 * @return The match mode flags 295 * @see URegexpFlag 296 * @stable ICU 3.0 297 */ 298 U_STABLE int32_t U_EXPORT2 299 uregex_flags(const URegularExpression *regexp, 300 UErrorCode *status); 301 302 303 /** 304 * Set the subject text string upon which the regular expression will look for matches. 305 * This function may be called any number of times, allowing the regular 306 * expression pattern to be applied to different strings. 307 * <p> 308 * Regular expression matching operations work directly on the application's 309 * string data. No copy is made. The subject string data must not be 310 * altered after calling this function until after all regular expression 311 * operations involving this string data are completed. 312 * <p> 313 * Zero length strings are permitted. In this case, no subsequent match 314 * operation will dereference the text string pointer. 
315 * 316 * @param regexp The compiled regular expression. 317 * @param text The subject text string. 318 * @param textLength The length of the subject text, or -1 if the string 319 * is NUL terminated. 320 * @param status Receives errors detected by this function. 321 * @stable ICU 3.0 322 */ 323 U_STABLE void U_EXPORT2 324 uregex_setText(URegularExpression *regexp, 325 const UChar *text, 326 int32_t textLength, 327 UErrorCode *status); 328 329 330 /** 331 * Set the subject text string upon which the regular expression will look for matches. 332 * This function may be called any number of times, allowing the regular 333 * expression pattern to be applied to different strings. 334 * <p> 335 * Regular expression matching operations work directly on the application's 336 * string data; only a shallow clone is made. The subject string data must not be 337 * altered after calling this function until after all regular expression 338 * operations involving this string data are completed. 339 * 340 * @param regexp The compiled regular expression. 341 * @param text The subject text string. 342 * @param status Receives errors detected by this function. 343 * 344 * @draft ICU 4.6 345 */ 346 U_DRAFT void U_EXPORT2 347 uregex_setUText(URegularExpression *regexp, 348 UText *text, 349 UErrorCode *status); 350 351 /** 352 * Get the subject text that is currently associated with this 353 * regular expression object. If the input was supplied using uregex_setText(), 354 * that pointer will be returned. Otherwise, the characters in the input will 355 * be extracted to a buffer and returned. In either case, ownership remains 356 * with the regular expression object. 357 * 358 * This function will work even if the input was originally specified as a UText. 359 * 360 * @param regexp The compiled regular expression. 361 * @param textLength The length of the string is returned in this output parameter. 
362 * A NULL pointer may be used here if the 363 * text length is not needed, as would be the case if 364 * the text is known in advance to be a NUL terminated 365 * string. 366 * @param status Receives errors detected by this function. 367 * @return Pointer to the subject text string currently associated with 368 * this regular expression. 369 * @stable ICU 3.0 370 */ 371 U_STABLE const UChar * U_EXPORT2 372 uregex_getText(URegularExpression *regexp, 373 int32_t *textLength, 374 UErrorCode *status); 375 376 377 /** 378 * Get the subject text that is currently associated with this 379 * regular expression object. 380 * 381 * This function will work even if the input was originally specified as a UChar string. 382 * 383 * @param regexp The compiled regular expression. 384 * @param dest A mutable UText in which to store the current input. 385 * If NULL, a new UText will be created as an immutable shallow clone 386 * of the actual input string. 387 * @param status Receives errors detected by this function. 388 * @return The subject text currently associated with this regular expression. 389 * If a pre-allocated UText was provided, it will always be used and returned. 390 * 391 * @draft ICU 4.6 392 */ 393 U_DRAFT UText * U_EXPORT2 394 uregex_getUText(URegularExpression *regexp, 395 UText *dest, 396 UErrorCode *status); 397 398 /** 399 * Set the subject text string upon which the regular expression is looking for matches 400 * without changing any other aspect of the matching state. 401 * The new and previous text strings must have the same content. 402 * 403 * This function is intended for use in environments where ICU is operating on 404 * strings that may move around in memory. It provides a mechanism for notifying 405 * ICU that the string has been relocated, and providing a new UText to access the 406 * string in its new position. 
407 * 408 * Note that the regular expression implementation never copies the underlying text 409 * of a string being matched, but always operates directly on the original text 410 * provided by the user. Refreshing simply drops the references to the old text 411 * and replaces them with references to the new. 412 * 413 * Caution: this function is normally used only by very specialized 414 * system-level code. One example use case is with garbage collection 415 * that moves the text in memory. 416 * 417 * @param regexp The compiled regular expression. 418 * @param text The new (moved) text string. 419 * @param status Receives errors detected by this function. 420 * 421 * @draft ICU 4.8 422 */ 423 U_DRAFT void U_EXPORT2 424 uregex_refreshUText(URegularExpression *regexp, 425 UText *text, 426 UErrorCode *status); 427 428 /** 429 * Attempts to match the input string against the pattern. 430 * To succeed, the match must extend to the end of the string, 431 * or cover the complete match region. 432 * 433 * If startIndex >= zero the match operation starts at the specified 434 * index and must extend to the end of the input string. Any region 435 * that has been specified is reset. 436 * 437 * If startIndex == -1 the match must cover the input region, or the entire 438 * input string if no region has been set. This directly corresponds to 439 * Matcher.matches() in Java 440 * 441 * @param regexp The compiled regular expression. 442 * @param startIndex The input string (native) index at which to begin matching, or -1 443 * to match the input Region. 444 * @param status Receives errors detected by this function. 445 * @return TRUE if there is a match 446 * @stable ICU 3.0 447 */ 448 U_STABLE UBool U_EXPORT2 449 uregex_matches(URegularExpression *regexp, 450 int32_t startIndex, 451 UErrorCode *status); 452 453 /** 454 * 64bit version of uregex_matches. 455 * Attempts to match the input string against the pattern. 
456 * To succeed, the match must extend to the end of the string, 457 * or cover the complete match region. 458 * 459 * If startIndex >= zero the match operation starts at the specified 460 * index and must extend to the end of the input string. Any region 461 * that has been specified is reset. 462 * 463 * If startIndex == -1 the match must cover the input region, or the entire 464 * input string if no region has been set. This directly corresponds to 465 * Matcher.matches() in Java 466 * 467 * @param regexp The compiled regular expression. 468 * @param startIndex The input string (native) index at which to begin matching, or -1 469 * to match the input Region. 470 * @param status Receives errors detected by this function. 471 * @return TRUE if there is a match 472 * @draft ICU 4.6 473 */ 474 U_DRAFT UBool U_EXPORT2 475 uregex_matches64(URegularExpression *regexp, 476 int64_t startIndex, 477 UErrorCode *status); 478 479 /** 480 * Attempts to match the input string, starting from the specified index, against the pattern. 481 * The match may be of any length, and is not required to extend to the end 482 * of the input string. Contrast with uregex_matches(). 483 * 484 * <p>If startIndex is >= 0 any input region that was set for this 485 * URegularExpression is reset before the operation begins. 486 * 487 * <p>If the specified starting index == -1 the match begins at the start of the input 488 * region, or at the start of the full string if no region has been specified. 489 * This corresponds directly with Matcher.lookingAt() in Java. 490 * 491 * <p>If the match succeeds then more information can be obtained via the 492 * <code>uregex_start()</code>, <code>uregex_end()</code>, 493 * and <code>uregex_group()</code> functions.</p> 494 * 495 * @param regexp The compiled regular expression. 
496 * @param startIndex The input string (native) index at which to begin matching, or 497 * -1 to match the Input Region 498 * @param status A reference to a UErrorCode to receive any errors. 499 * @return TRUE if there is a match. 500 * @stable ICU 3.0 501 */ 502 U_STABLE UBool U_EXPORT2 503 uregex_lookingAt(URegularExpression *regexp, 504 int32_t startIndex, 505 UErrorCode *status); 506 507 /** 508 * 64bit version of uregex_lookingAt. 509 * Attempts to match the input string, starting from the specified index, against the pattern. 510 * The match may be of any length, and is not required to extend to the end 511 * of the input string. Contrast with uregex_matches(). 512 * 513 * <p>If startIndex is >= 0 any input region that was set for this 514 * URegularExpression is reset before the operation begins. 515 * 516 * <p>If the specified starting index == -1 the match begins at the start of the input 517 * region, or at the start of the full string if no region has been specified. 518 * This corresponds directly with Matcher.lookingAt() in Java. 519 * 520 * <p>If the match succeeds then more information can be obtained via the 521 * <code>uregex_start()</code>, <code>uregex_end()</code>, 522 * and <code>uregex_group()</code> functions.</p> 523 * 524 * @param regexp The compiled regular expression. 525 * @param startIndex The input string (native) index at which to begin matching, or 526 * -1 to match the Input Region 527 * @param status A reference to a UErrorCode to receive any errors. 528 * @return TRUE if there is a match. 529 * @draft ICU 4.6 530 */ 531 U_DRAFT UBool U_EXPORT2 532 uregex_lookingAt64(URegularExpression *regexp, 533 int64_t startIndex, 534 UErrorCode *status); 535 536 /** 537 * Find the first matching substring of the input string that matches the pattern. 538 * If startIndex is >= zero the search for a match begins at the specified index, 539 * and any match region is reset. 
This corresponds directly with 540 * Matcher.find(startIndex) in Java. 541 * 542 * If startIndex == -1 the search begins at the start of the input region, 543 * or at the start of the full string if no region has been specified. 544 * 545 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 546 * <code>uregex_group()</code> will provide more information regarding the match. 547 * 548 * @param regexp The compiled regular expression. 549 * @param startIndex The position (native) in the input string to begin the search, or 550 * -1 to search within the Input Region. 551 * @param status A reference to a UErrorCode to receive any errors. 552 * @return TRUE if a match is found. 553 * @stable ICU 3.0 554 */ 555 U_STABLE UBool U_EXPORT2 556 uregex_find(URegularExpression *regexp, 557 int32_t startIndex, 558 UErrorCode *status); 559 560 /** 561 * 64bit version of uregex_find. 562 * Find the first matching substring of the input string that matches the pattern. 563 * If startIndex is >= zero the search for a match begins at the specified index, 564 * and any match region is reset. This corresponds directly with 565 * Matcher.find(startIndex) in Java. 566 * 567 * If startIndex == -1 the search begins at the start of the input region, 568 * or at the start of the full string if no region has been specified. 569 * 570 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 571 * <code>uregex_group()</code> will provide more information regarding the match. 572 * 573 * @param regexp The compiled regular expression. 574 * @param startIndex The position (native) in the input string to begin the search, or 575 * -1 to search within the Input Region. 576 * @param status A reference to a UErrorCode to receive any errors. 577 * @return TRUE if a match is found. 
578 * @draft ICU 4.6 579 */ 580 U_DRAFT UBool U_EXPORT2 581 uregex_find64(URegularExpression *regexp, 582 int64_t startIndex, 583 UErrorCode *status); 584 585 /** 586 * Find the next pattern match in the input string. Begin searching 587 * the input at the location following the end of the previous match, 588 * or at the start of the string (or region) if there is no 589 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 590 * <code>uregex_group()</code> will provide more information regarding the match. 591 * 592 * @param regexp The compiled regular expression. 593 * @param status A reference to a UErrorCode to receive any errors. 594 * @return TRUE if a match is found. 595 * @see uregex_reset 596 * @stable ICU 3.0 597 */ 598 U_STABLE UBool U_EXPORT2 599 uregex_findNext(URegularExpression *regexp, 600 UErrorCode *status); 601 602 /** 603 * Get the number of capturing groups in this regular expression's pattern. 604 * @param regexp The compiled regular expression. 605 * @param status A reference to a UErrorCode to receive any errors. 606 * @return the number of capture groups 607 * @stable ICU 3.0 608 */ 609 U_STABLE int32_t U_EXPORT2 610 uregex_groupCount(URegularExpression *regexp, 611 UErrorCode *status); 612 613 /** Extract the string for the specified matching expression or subexpression. 614 * Group #0 is the complete string of matched text. 615 * Group #1 is the text matched by the first set of capturing parentheses. 616 * 617 * @param regexp The compiled regular expression. 618 * @param groupNum The capture group to extract. Group 0 is the complete 619 * match. The value of this parameter must be 620 * less than or equal to the number of capture groups in 621 * the pattern. 622 * @param dest Buffer to receive the matching string data 623 * @param destCapacity Capacity of the dest buffer. 624 * @param status A reference to a UErrorCode to receive any errors. 
625 * @return Length of matching data, 626 * or -1 if no applicable match. 627 * @stable ICU 3.0 628 */ 629 U_STABLE int32_t U_EXPORT2 630 uregex_group(URegularExpression *regexp, 631 int32_t groupNum, 632 UChar *dest, 633 int32_t destCapacity, 634 UErrorCode *status); 635 636 637 /** Returns a shallow immutable clone of the entire input string. The returned UText current native index 638 * is set to the beginning of the requested capture group. The capture group length is also 639 * returned via groupLength. 640 * Group #0 is the complete string of matched text. 641 * Group #1 is the text matched by the first set of capturing parentheses. 642 * 643 * @param regexp The compiled regular expression. 644 * @param groupNum The capture group to extract. Group 0 is the complete 645 * match. The value of this parameter must be 646 * less than or equal to the number of capture groups in 647 * the pattern. 648 * @param dest A mutable UText in which to store the current input. 649 * If NULL, a new UText will be created as an immutable shallow clone 650 * of the entire input string. 651 * @param groupLength The group length of the desired capture group. 652 * @param status A reference to a UErrorCode to receive any errors. 653 * @return The subject text currently associated with this regular expression. 654 * If a pre-allocated UText was provided, it will always be used and returned. 655 656 * 657 * @draft ICU 4.6 658 */ 659 U_DRAFT UText * U_EXPORT2 660 uregex_groupUText(URegularExpression *regexp, 661 int32_t groupNum, 662 UText *dest, 663 int64_t *groupLength, 664 UErrorCode *status); 665 666 667 /** Extract the string for the specified matching expression or subexpression. 668 * Group #0 is the complete string of matched text. 669 * Group #1 is the text matched by the first set of capturing parentheses. 670 * 671 * @param regexp The compiled regular expression. 672 * @param groupNum The capture group to extract. Group 0 is the complete 673 * match. 
The value of this parameter must be 674 * less than or equal to the number of capture groups in 675 * the pattern. 676 * @param dest Mutable UText to receive the matching string data. 677 * If NULL, a new UText will be created (which may not be mutable). 678 * @param status A reference to a UErrorCode to receive any errors. 679 * @return The matching string data. If a pre-allocated UText was provided, 680 * it will always be used and returned. 681 * 682 * @internal ICU 4.4 technology preview 683 */ 684 U_INTERNAL UText * U_EXPORT2 685 uregex_groupUTextDeep(URegularExpression *regexp, 686 int32_t groupNum, 687 UText *dest, 688 UErrorCode *status); 689 690 /** 691 * Returns the index in the input string of the start of the text matched by the 692 * specified capture group during the previous match operation. Return -1 if 693 * the capture group was not part of the last match. 694 * Group #0 refers to the complete range of matched text. 695 * Group #1 refers to the text matched by the first set of capturing parentheses. 696 * 697 * @param regexp The compiled regular expression. 698 * @param groupNum The capture group number 699 * @param status A reference to a UErrorCode to receive any errors. 700 * @return the starting (native) position in the input of the text matched 701 * by the specified group. 702 * @stable ICU 3.0 703 */ 704 U_STABLE int32_t U_EXPORT2 705 uregex_start(URegularExpression *regexp, 706 int32_t groupNum, 707 UErrorCode *status); 708 709 /** 710 * 64bit version of uregex_start. 711 * Returns the index in the input string of the start of the text matched by the 712 * specified capture group during the previous match operation. Return -1 if 713 * the capture group was not part of the last match. 714 * Group #0 refers to the complete range of matched text. 715 * Group #1 refers to the text matched by the first set of capturing parentheses. 716 * 717 * @param regexp The compiled regular expression. 
718 * @param groupNum The capture group number 719 * @param status A reference to a UErrorCode to receive any errors. 720 * @return the starting (native) position in the input of the text matched 721 * by the specified group. 722 * @draft ICU 4.6 723 */ 724 U_DRAFT int64_t U_EXPORT2 725 uregex_start64(URegularExpression *regexp, 726 int32_t groupNum, 727 UErrorCode *status); 728 729 /** 730 * Returns the index in the input string of the position following the end 731 * of the text matched by the specified capture group. 732 * Return -1 if the capture group was not part of the last match. 733 * Group #0 refers to the complete range of matched text. 734 * Group #1 refers to the text matched by the first set of capturing parentheses. 735 * 736 * @param regexp The compiled regular expression. 737 * @param groupNum The capture group number 738 * @param status A reference to a UErrorCode to receive any errors. 739 * @return the (native) index of the position following the last matched character. 740 * @stable ICU 3.0 741 */ 742 U_STABLE int32_t U_EXPORT2 743 uregex_end(URegularExpression *regexp, 744 int32_t groupNum, 745 UErrorCode *status); 746 747 /** 748 * 64bit version of uregex_end. 749 * Returns the index in the input string of the position following the end 750 * of the text matched by the specified capture group. 751 * Return -1 if the capture group was not part of the last match. 752 * Group #0 refers to the complete range of matched text. 753 * Group #1 refers to the text matched by the first set of capturing parentheses. 754 * 755 * @param regexp The compiled regular expression. 756 * @param groupNum The capture group number 757 * @param status A reference to a UErrorCode to receive any errors. 758 * @return the (native) index of the position following the last matched character. 
759 * @draft ICU 4.6 760 */ 761 U_DRAFT int64_t U_EXPORT2 762 uregex_end64(URegularExpression *regexp, 763 int32_t groupNum, 764 UErrorCode *status); 765 766 /** 767 * Reset any saved state from the previous match. Has the effect of 768 * causing uregex_findNext to begin at the specified index, and causing 769 * uregex_start(), uregex_end() and uregex_group() to return an error 770 * indicating that there is no match information available. Clears any 771 * match region that may have been set. 772 * 773 * @param regexp The compiled regular expression. 774 * @param index The position (native) in the text at which a 775 * uregex_findNext() should begin searching. 776 * @param status A reference to a UErrorCode to receive any errors. 777 * @stable ICU 3.0 778 */ 779 U_STABLE void U_EXPORT2 780 uregex_reset(URegularExpression *regexp, 781 int32_t index, 782 UErrorCode *status); 783 784 /** 785 * 64bit version of uregex_reset. 786 * Reset any saved state from the previous match. Has the effect of 787 * causing uregex_findNext to begin at the specified index, and causing 788 * uregex_start(), uregex_end() and uregex_group() to return an error 789 * indicating that there is no match information available. Clears any 790 * match region that may have been set. 791 * 792 * @param regexp The compiled regular expression. 793 * @param index The position (native) in the text at which a 794 * uregex_findNext() should begin searching. 795 * @param status A reference to a UErrorCode to receive any errors. 796 * @draft ICU 4.6 797 */ 798 U_DRAFT void U_EXPORT2 799 uregex_reset64(URegularExpression *regexp, 800 int64_t index, 801 UErrorCode *status); 802 803 /** 804 * Sets the limits of the matching region for this URegularExpression. 805 * The region is the part of the input string that will be considered when matching. 
806 * Invoking this method resets any saved state from the previous match, 807 * then sets the region to start at the index specified by the start parameter 808 * and end at the index specified by the end parameter. 809 * 810 * Depending on the transparency and anchoring being used (see useTransparentBounds 811 * and useAnchoringBounds), certain constructs such as anchors may behave differently 812 * at or around the boundaries of the region 813 * 814 * The function will fail if start is greater than limit, or if either index 815 * is less than zero or greater than the length of the string being matched. 816 * 817 * @param regexp The compiled regular expression. 818 * @param regionStart The (native) index to begin searches at. 819 * @param regionLimit The (native) index to end searches at (exclusive). 820 * @param status A pointer to a UErrorCode to receive any errors. 821 * @stable ICU 4.0 822 */ 823 U_STABLE void U_EXPORT2 824 uregex_setRegion(URegularExpression *regexp, 825 int32_t regionStart, 826 int32_t regionLimit, 827 UErrorCode *status); 828 829 /** 830 * 64bit version of uregex_setRegion. 831 * Sets the limits of the matching region for this URegularExpression. 832 * The region is the part of the input string that will be considered when matching. 833 * Invoking this method resets any saved state from the previous match, 834 * then sets the region to start at the index specified by the start parameter 835 * and end at the index specified by the end parameter. 836 * 837 * Depending on the transparency and anchoring being used (see useTransparentBounds 838 * and useAnchoringBounds), certain constructs such as anchors may behave differently 839 * at or around the boundaries of the region 840 * 841 * The function will fail if start is greater than limit, or if either index 842 * is less than zero or greater than the length of the string being matched. 843 * 844 * @param regexp The compiled regular expression. 
845 * @param regionStart The (native) index to begin searches at. 846 * @param regionLimit The (native) index to end searches at (exclusive). 847 * @param status A pointer to a UErrorCode to receive any errors. 848 * @draft ICU 4.6 849 */ 850 U_DRAFT void U_EXPORT2 851 uregex_setRegion64(URegularExpression *regexp, 852 int64_t regionStart, 853 int64_t regionLimit, 854 UErrorCode *status); 855 856 /** 857 * Set the matching region and the starting index for subsequent matches 858 * in a single operation. 859 * This is useful because the usual function for setting the starting 860 * index, uregex_reset(), also resets any region limits. 861 * 862 * @param regexp The compiled regular expression. 863 * @param regionStart The (native) index to begin searches at. 864 * @param regionLimit The (native) index to end searches at (exclusive). 865 * @param startIndex The index in the input text at which the next 866 * match operation should begin. 867 * @param status A pointer to a UErrorCode to receive any errors. 868 * @draft ICU 4.6 869 */ 870 U_DRAFT void U_EXPORT2 871 uregex_setRegionAndStart(URegularExpression *regexp, 872 int64_t regionStart, 873 int64_t regionLimit, 874 int64_t startIndex, 875 UErrorCode *status); 876 877 /** 878 * Reports the start index of the matching region. Any matches found are limited 879 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive). 880 * 881 * @param regexp The compiled regular expression. 882 * @param status A pointer to a UErrorCode to receive any errors. 883 * @return The starting (native) index of this matcher's region. 884 * @stable ICU 4.0 885 */ 886 U_STABLE int32_t U_EXPORT2 887 uregex_regionStart(const URegularExpression *regexp, 888 UErrorCode *status); 889 890 /** 891 * 64bit version of uregex_regionStart. 892 * Reports the start index of the matching region. Any matches found are limited to 893 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
894 * 895 * @param regexp The compiled regular expression. 896 * @param status A pointer to a UErrorCode to receive any errors. 897 * @return The starting (native) index of this matcher's region. 898 * @draft ICU 4.6 899 */ 900 U_DRAFT int64_t U_EXPORT2 901 uregex_regionStart64(const URegularExpression *regexp, 902 UErrorCode *status); 903 904 /** 905 * Reports the end index (exclusive) of the matching region for this URegularExpression. 906 * Any matches found are limited to the region bounded by regionStart (inclusive) 907 * and regionEnd (exclusive). 908 * 909 * @param regexp The compiled regular expression. 910 * @param status A pointer to a UErrorCode to receive any errors. 911 * @return The ending point (native) of this matcher's region. 912 * @stable ICU 4.0 913 */ 914 U_STABLE int32_t U_EXPORT2 915 uregex_regionEnd(const URegularExpression *regexp, 916 UErrorCode *status); 917 918 /** 919 * 64bit version of uregex_regionEnd. 920 * Reports the end index (exclusive) of the matching region for this URegularExpression. 921 * Any matches found are limited to the region bounded by regionStart (inclusive) 922 * and regionEnd (exclusive). 923 * 924 * @param regexp The compiled regular expression. 925 * @param status A pointer to a UErrorCode to receive any errors. 926 * @return The ending point (native) of this matcher's region. 927 * @draft ICU 4.6 928 */ 929 U_DRAFT int64_t U_EXPORT2 930 uregex_regionEnd64(const URegularExpression *regexp, 931 UErrorCode *status); 932 933 /** 934 * Queries the transparency of region bounds for this URegularExpression. 935 * See uregex_useTransparentBounds() for a description of transparent and opaque bounds. 936 * By default, matching boundaries are opaque. 937 * 938 * @param regexp The compiled regular expression. 939 * @param status A pointer to a UErrorCode to receive any errors. 940 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
941 * @stable ICU 4.0 942 */ 943 U_STABLE UBool U_EXPORT2 944 uregex_hasTransparentBounds(const URegularExpression *regexp, 945 UErrorCode *status); 946 947 948 /** 949 * Sets the transparency of region bounds for this URegularExpression. 950 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 951 * If the boolean argument is FALSE, then opaque bounds will be used. 952 * 953 * Using transparent bounds, the boundaries of the matching region are transparent 954 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 955 * see text beyond the boundaries of the region while checking for a match. 956 * 957 * With opaque bounds, no text outside of the matching region is visible to lookahead, 958 * lookbehind, and boundary matching constructs. 959 * 960 * By default, opaque bounds are used. 961 * 962 * @param regexp The compiled regular expression. 963 * @param b TRUE for transparent bounds; FALSE for opaque bounds. 964 * @param status A pointer to a UErrorCode to receive any errors. 965 * @stable ICU 4.0 966 **/ 967 U_STABLE void U_EXPORT2 968 uregex_useTransparentBounds(URegularExpression *regexp, 969 UBool b, 970 UErrorCode *status); 971 972 973 /** 974 * Return TRUE if this URegularExpression is using anchoring bounds. 975 * By default, anchoring region bounds are used. 976 * 977 * @param regexp The compiled regular expression. 978 * @param status A pointer to a UErrorCode to receive any errors. 979 * @return TRUE if this matcher is using anchoring bounds. 980 * @stable ICU 4.0 981 */ 982 U_STABLE UBool U_EXPORT2 983 uregex_hasAnchoringBounds(const URegularExpression *regexp, 984 UErrorCode *status); 985 986 987 /** 988 * Set whether this URegularExpression is using Anchoring Bounds for its region. 989 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 990 * and end of the region.
Without Anchoring Bounds, anchors will only match at 991 * the positions they would in the complete text. 992 * 993 * Anchoring Bounds are the default for regions. 994 * 995 * @param regexp The compiled regular expression. 996 * @param b TRUE to enable anchoring bounds; FALSE to disable them. 997 * @param status A pointer to a UErrorCode to receive any errors. 998 * @stable ICU 4.0 999 */ 1000 U_STABLE void U_EXPORT2 1001 uregex_useAnchoringBounds(URegularExpression *regexp, 1002 UBool b, 1003 UErrorCode *status); 1004 1005 /** 1006 * Return TRUE if the most recent matching operation touched the 1007 * end of the text being processed. In this case, additional input text could 1008 * change the results of that match. 1009 * 1010 * @param regexp The compiled regular expression. 1011 * @param status A pointer to a UErrorCode to receive any errors. 1012 * @return TRUE if the most recent match hit the end of input. 1013 * @stable ICU 4.0 1014 */ 1015 U_STABLE UBool U_EXPORT2 1016 uregex_hitEnd(const URegularExpression *regexp, 1017 UErrorCode *status); 1018 1019 /** 1020 * Return TRUE if the most recent match succeeded and additional input could cause 1021 * it to fail. If this function returns FALSE and a match was found, then more input 1022 * might change the match but the match won't be lost. If a match was not found, 1023 * then the return value of uregex_requireEnd() has no meaning. 1024 * 1025 * @param regexp The compiled regular expression. 1026 * @param status A pointer to a UErrorCode to receive any errors. 1027 * @return TRUE if more input could cause the most recent match to no longer match. 1028 * @stable ICU 4.0 1029 */ 1030 U_STABLE UBool U_EXPORT2 1031 uregex_requireEnd(const URegularExpression *regexp, 1032 UErrorCode *status); 1033 1034 1035 1036 1037 1038 /** 1039 * Replaces every substring of the input that matches the pattern 1040 * with the given replacement string. This is a convenience function that 1041 * provides a complete find-and-replace-all operation.
1042 * 1043 * This method scans the input string looking for matches of the pattern. 1044 * Input that is not part of any match is copied unchanged to the 1045 * destination buffer. Matched regions are replaced in the output 1046 * buffer by the replacement string. The replacement string may contain 1047 * references to capture groups; these take the form of $1, $2, etc. 1048 * 1049 * @param regexp The compiled regular expression. 1050 * @param replacementText A string containing the replacement text. 1051 * @param replacementLength The length of the replacement string, or 1052 * -1 if it is NUL terminated. 1053 * @param destBuf A (UChar *) buffer that will receive the result. 1054 * @param destCapacity The capacity of the destination buffer. 1055 * @param status A reference to a UErrorCode to receive any errors. 1056 * @return The length of the string resulting from the find 1057 * and replace operation. In the event that the 1058 * destination capacity is inadequate, the return value 1059 * is still the full length of the untruncated string. 1060 * @stable ICU 3.0 1061 */ 1062 U_STABLE int32_t U_EXPORT2 1063 uregex_replaceAll(URegularExpression *regexp, 1064 const UChar *replacementText, 1065 int32_t replacementLength, 1066 UChar *destBuf, 1067 int32_t destCapacity, 1068 UErrorCode *status); 1069 1070 /** 1071 * Replaces every substring of the input that matches the pattern 1072 * with the given replacement string. This is a convenience function that 1073 * provides a complete find-and-replace-all operation. 1074 * 1075 * This method scans the input string looking for matches of the pattern. 1076 * Input that is not part of any match is copied unchanged to the 1077 * destination buffer. Matched regions are replaced in the output 1078 * buffer by the replacement string. The replacement string may contain 1079 * references to capture groups; these take the form of $1, $2, etc. 1080 * 1081 * @param regexp The compiled regular expression. 
1082 * @param replacement A string containing the replacement text. 1083 * @param dest A mutable UText that will receive the result. 1084 * If NULL, a new UText will be created (which may not be mutable). 1085 * @param status A reference to a UErrorCode to receive any errors. 1086 * @return A UText containing the results of the find and replace. 1087 * If a pre-allocated UText was provided, it will always be used and returned. 1088 * 1089 * @draft ICU 4.6 1090 */ 1091 U_DRAFT UText * U_EXPORT2 1092 uregex_replaceAllUText(URegularExpression *regexp, 1093 UText *replacement, 1094 UText *dest, 1095 UErrorCode *status); 1096 1097 /** 1098 * Replaces the first substring of the input that matches the pattern 1099 * with the given replacement string. This is a convenience function that 1100 * provides a complete find-and-replace operation. 1101 * 1102 * This method scans the input string looking for a match of the pattern. 1103 * All input that is not part of the match is copied unchanged to the 1104 * destination buffer. The matched region is replaced in the output 1105 * buffer by the replacement string. The replacement string may contain 1106 * references to capture groups; these take the form of $1, $2, etc. 1107 * 1108 * @param regexp The compiled regular expression. 1109 * @param replacementText A string containing the replacement text. 1110 * @param replacementLength The length of the replacement string, or 1111 * -1 if it is NUL terminated. 1112 * @param destBuf A (UChar *) buffer that will receive the result. 1113 * @param destCapacity The capacity of the destination buffer. 1114 * @param status a reference to a UErrorCode to receive any errors. 1115 * @return The length of the string resulting from the find 1116 * and replace operation. In the event that the 1117 * destination capacity is inadequate, the return value 1118 * is still the full length of the untruncated string. 
1119 * @stable ICU 3.0 1120 */ 1121 U_STABLE int32_t U_EXPORT2 1122 uregex_replaceFirst(URegularExpression *regexp, 1123 const UChar *replacementText, 1124 int32_t replacementLength, 1125 UChar *destBuf, 1126 int32_t destCapacity, 1127 UErrorCode *status); 1128 1129 /** 1130 * Replaces the first substring of the input that matches the pattern 1131 * with the given replacement string. This is a convenience function that 1132 * provides a complete find-and-replace operation. 1133 * 1134 * This method scans the input string looking for a match of the pattern. 1135 * All input that is not part of the match is copied unchanged to the 1136 * destination buffer. The matched region is replaced in the output 1137 * buffer by the replacement string. The replacement string may contain 1138 * references to capture groups; these take the form of $1, $2, etc. 1139 * 1140 * @param regexp The compiled regular expression. 1141 * @param replacement A string containing the replacement text. 1142 * @param dest A mutable UText that will receive the result. 1143 * If NULL, a new UText will be created (which may not be mutable). 1144 * @param status A reference to a UErrorCode to receive any errors. 1145 * @return A UText containing the results of the find and replace. 1146 * If a pre-allocated UText was provided, it will always be used and returned. 1147 * 1148 * @draft ICU 4.6 1149 */ 1150 U_DRAFT UText * U_EXPORT2 1151 uregex_replaceFirstUText(URegularExpression *regexp, 1152 UText *replacement, 1153 UText *dest, 1154 UErrorCode *status); 1155 1156 1157 /** 1158 * Implements a replace operation intended to be used as part of an 1159 * incremental find-and-replace. 1160 * 1161 * <p>The input string, starting from the end of the previous match and ending at 1162 * the start of the current match, is appended to the destination string.
Then the 1163 * replacement string is appended to the output string, 1164 * including handling any substitutions of captured text.</p> 1165 * 1166 * <p>A note on preflight computation of buffer size and error handling: 1167 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1168 * designed to be chained, one after another, with the destination 1169 * buffer pointer and buffer capacity updated after each in preparation 1170 * for the next. If the destination buffer is exhausted partway through such a 1171 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 1172 * ICU conventions are for a function to perform no action if it is 1173 * called with an error status, but for this one case, uregex_appendReplacement() 1174 * will operate normally so that buffer size computations will complete 1175 * correctly.</p> 1176 * 1177 * <p>For simple, prepackaged, non-incremental find-and-replace 1178 * operations, see uregex_replaceFirst() or uregex_replaceAll().</p> 1179 * 1180 * @param regexp The regular expression object. 1181 * @param replacementText The string that will replace the matched portion of the 1182 * input string as it is copied to the destination buffer. 1183 * The replacement text may contain references ($1, for 1184 * example) to capture groups from the match. 1185 * @param replacementLength The length of the replacement text string, 1186 * or -1 if the string is NUL terminated. 1187 * @param destBuf The buffer into which the results of the 1188 * find-and-replace are placed. On return, this pointer 1189 * will be updated to refer to the beginning of the 1190 * unused portion of buffer, leaving it in position for 1191 * a subsequent call to this function. 1192 * @param destCapacity The size of the output buffer. On return, this 1193 * parameter will be updated to reflect the space remaining 1194 * unused in the output buffer. 1195 * @param status A reference to a UErrorCode to receive any errors. 1196 * @return The length of the result string.
In the event that 1197 * destCapacity is inadequate, the full length of the 1198 * untruncated output string is returned. 1199 * 1200 * @stable ICU 3.0 1201 * 1202 */ 1203 U_STABLE int32_t U_EXPORT2 1204 uregex_appendReplacement(URegularExpression *regexp, 1205 const UChar *replacementText, 1206 int32_t replacementLength, 1207 UChar **destBuf, 1208 int32_t *destCapacity, 1209 UErrorCode *status); 1210 1211 1212 /** 1213 * Implements a replace operation intended to be used as part of an 1214 * incremental find-and-replace. 1215 * 1216 * <p>The input string, starting from the end of the previous match and ending at 1217 * the start of the current match, is appended to the destination string. Then the 1218 * replacement string is appended to the output string, 1219 * including handling any substitutions of captured text.</p> 1220 * 1221 * <p>For simple, prepackaged, non-incremental find-and-replace 1222 * operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p> 1223 * 1224 * @param regexp The regular expression object. 1225 * @param replacementText The string that will replace the matched portion of the 1226 * input string as it is copied to the destination buffer. 1227 * The replacement text may contain references ($1, for 1228 * example) to capture groups from the match. 1229 * @param dest A mutable UText that will receive the result. Must not be NULL. 1230 * @param status A reference to a UErrorCode to receive any errors. 1231 * 1232 * @draft ICU 4.6 1233 */ 1234 U_DRAFT void U_EXPORT2 1235 uregex_appendReplacementUText(URegularExpression *regexp, 1236 UText *replacementText, 1237 UText *dest, 1238 UErrorCode *status); 1239 1240 1241 /** 1242 * As the final step in a find-and-replace operation, append the remainder 1243 * of the input string, starting at the position following the last match, 1244 * to the destination string.
<code>uregex_appendTail()</code> is intended 1245 * to be invoked after one or more invocations of the 1246 * <code>uregex_appendReplacement()</code> function. 1247 * 1248 * @param regexp The regular expression object. This is needed to 1249 * obtain the input string and with the position 1250 * of the last match within it. 1251 * @param destBuf The buffer in which the results of the 1252 * find-and-replace are placed. On return, the pointer 1253 * will be updated to refer to the beginning of the 1254 * unused portion of buffer. 1255 * @param destCapacity The size of the output buffer, On return, this 1256 * value will be updated to reflect the space remaining 1257 * unused in the output buffer. 1258 * @param status A reference to a UErrorCode to receive any errors. 1259 * @return The length of the result string. In the event that 1260 * destCapacity is inadequate, the full length of the 1261 * untruncated output string is returned. 1262 * 1263 * @stable ICU 3.0 1264 */ 1265 U_STABLE int32_t U_EXPORT2 1266 uregex_appendTail(URegularExpression *regexp, 1267 UChar **destBuf, 1268 int32_t *destCapacity, 1269 UErrorCode *status); 1270 1271 1272 /** 1273 * As the final step in a find-and-replace operation, append the remainder 1274 * of the input string, starting at the position following the last match, 1275 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1276 * to be invoked after one or more invocations of the 1277 * <code>uregex_appendReplacementUText()</code> function. 1278 * 1279 * @param regexp The regular expression object. This is needed to 1280 * obtain the input string and with the position 1281 * of the last match within it. 1282 * @param dest A mutable UText that will receive the result. Must not be NULL. 1283 * 1284 * @param status Error code 1285 * 1286 * @return The destination UText. 
1287 * 1288 * @draft ICU 4.6 1289 */ 1290 U_DRAFT UText * U_EXPORT2 1291 uregex_appendTailUText(URegularExpression *regexp, 1292 UText *dest, 1293 UErrorCode *status); 1294 1295 1296 1297 /** 1298 * Split a string into fields. Somewhat like split() from Perl. 1299 * The pattern matches identify delimiters that separate the input 1300 * into fields. The input data between the matches becomes the 1301 * fields themselves. 1302 * 1303 * Each of the fields is copied from the input string to the destination 1304 * buffer, and NUL terminated. The position of each field within 1305 * the destination buffer is returned in the destFields array. 1306 * 1307 * If the delimiter pattern includes capture groups, the captured text will 1308 * also appear in the destination array of output strings, interspersed 1309 * with the fields. This is similar to Perl, but differs from Java, 1310 * which ignores the presence of capture groups in the pattern. 1311 * 1312 * Trailing empty fields will always be returned, assuming sufficient 1313 * destination capacity. This differs from the default behavior for Java 1314 * and Perl where trailing empty fields are not returned. 1315 * 1316 * The number of strings produced by the split operation is returned. 1317 * This count includes the strings from capture groups in the delimiter pattern. 1318 * This behavior differs from Java, which ignores capture groups. 1319 * 1320 * @param regexp The compiled regular expression. 1321 * @param destBuf A (UChar *) buffer to receive the fields that 1322 * are extracted from the input string. These 1323 * field pointers will refer to positions within the 1324 * destination buffer supplied by the caller. Any 1325 * extra positions within the destFields array will be 1326 * set to NULL. 1327 * @param destCapacity The capacity of the destBuf. 1328 * @param requiredCapacity The actual capacity required of the destBuf.
1329 * If destCapacity is too small, requiredCapacity will return 1330 * the total capacity required to hold all of the output, and 1331 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1332 * @param destFields An array to be filled with the position of each 1333 * of the extracted fields within destBuf. 1334 * @param destFieldsCapacity The number of elements in the destFields array. 1335 * If the number of fields found is less than destFieldsCapacity, 1336 * the extra destFields elements are set to NULL. 1337 * If destFieldsCapacity is too small, the trailing part of the 1338 * input, including any field delimiters, is treated as if it 1339 * were the last field - it is copied to the destBuf, and 1340 * its position in the destBuf is stored in the last element 1341 * of destFields. This behavior mimics that of Perl. It is not 1342 * an error condition, and no error status is returned when all destFields 1343 * positions are used. 1344 * @param status A reference to a UErrorCode to receive any errors. 1345 * @return The number of fields into which the input string was split. 1346 * @stable ICU 3.0 1347 */ 1348 U_STABLE int32_t U_EXPORT2 1349 uregex_split( URegularExpression *regexp, 1350 UChar *destBuf, 1351 int32_t destCapacity, 1352 int32_t *requiredCapacity, 1353 UChar *destFields[], 1354 int32_t destFieldsCapacity, 1355 UErrorCode *status); 1356 1357 1358 /** 1359 * Split a string into fields. Somewhat like split() from Perl. 1360 * The pattern matches identify delimiters that separate the input 1361 * into fields. The input data between the matches becomes the 1362 * fields themselves. 1363 * <p> 1364 * The behavior of this function is not very closely aligned with uregex_split(); 1365 * instead, it is based on (and implemented directly on top of) the C++ split method. 1366 * 1367 * @param regexp The compiled regular expression. 1368 * @param destFields An array of mutable UText structs to receive the results of the split.
1369 * If a field is NULL, a new UText is allocated to contain the results for 1370 * that field. This new UText is not guaranteed to be mutable. 1371 * @param destFieldsCapacity The number of elements in the destination array. 1372 * If the number of fields found is less than destFieldsCapacity, the 1373 * extra strings in the destination array are not altered. 1374 * If the number of destination strings is less than the number 1375 * of fields, the trailing part of the input string, including any 1376 * field delimiters, is placed in the last destination string. 1377 * This behavior mimics that of Perl. It is not an error condition, and no 1378 * error status is returned when all destFields positions are used. 1379 * @param status A reference to a UErrorCode to receive any errors. 1380 * @return The number of fields into which the input string was split. 1381 * 1382 * @draft ICU 4.6 1383 */ 1384 U_DRAFT int32_t U_EXPORT2 1385 uregex_splitUText(URegularExpression *regexp, 1386 UText *destFields[], 1387 int32_t destFieldsCapacity, 1388 UErrorCode *status); 1389 1390 1391 1392 1393 /** 1394 * Set a processing time limit for match operations with this URegularExpression. 1395 * 1396 * Some patterns, when matching certain strings, can run in exponential time. 1397 * For practical purposes, the match operation may appear to be in an 1398 * infinite loop. 1399 * When a limit is set a match operation will fail with an error if the 1400 * limit is exceeded. 1401 * <p> 1402 * The units of the limit are steps of the match engine. 1403 * Correspondence with actual processor time will depend on the speed 1404 * of the processor and the details of the specific pattern, but will 1405 * typically be on the order of milliseconds. 1406 * <p> 1407 * By default, the matching time is not limited. 1408 * <p> 1409 * 1410 * @param regexp The compiled regular expression. 1411 * @param limit The limit value, or 0 for no limit.
1412 * @param status A reference to a UErrorCode to receive any errors. 1413 * @stable ICU 4.0 1414 */ 1415 U_STABLE void U_EXPORT2 1416 uregex_setTimeLimit(URegularExpression *regexp, 1417 int32_t limit, 1418 UErrorCode *status); 1419 1420 /** 1421 * Get the time limit for matches with this URegularExpression. 1422 * A return value of zero indicates that there is no limit. 1423 * 1424 * @param regexp The compiled regular expression. 1425 * @param status A reference to a UErrorCode to receive any errors. 1426 * @return the maximum allowed time for a match, in units of processing steps. 1427 * @stable ICU 4.0 1428 */ 1429 U_STABLE int32_t U_EXPORT2 1430 uregex_getTimeLimit(const URegularExpression *regexp, 1431 UErrorCode *status); 1432 1433 /** 1434 * Set the amount of heap storage available for use by the match backtracking stack. 1435 * <p> 1436 * ICU uses a backtracking regular expression engine, with the backtrack stack 1437 * maintained on the heap. This function sets the limit to the amount of memory 1438 * that can be used for this purpose. A backtracking stack overflow will 1439 * result in an error from the match operation that caused it. 1440 * <p> 1441 * A limit is desirable because a malicious or poorly designed pattern can use 1442 * excessive memory, potentially crashing the process. A limit is enabled 1443 * by default. 1444 * <p> 1445 * @param regexp The compiled regular expression. 1446 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1447 * A value of -1 means no limit. 1448 * The limit must be greater than zero, or -1. NOTE(review): uregex_getStackLimit() documents zero, not -1, as the value meaning unlimited; confirm which no-limit value is correct. 1449 * @param status A reference to a UErrorCode to receive any errors. 1450 * 1451 * @stable ICU 4.0 1452 */ 1453 U_STABLE void U_EXPORT2 1454 uregex_setStackLimit(URegularExpression *regexp, 1455 int32_t limit, 1456 UErrorCode *status); 1457 1458 /** 1459 * Get the size of the heap storage available for use by the back tracking stack.
1460 * 1461 * @return the maximum backtracking stack size, in bytes, or zero if the 1462 * stack size is unlimited. 1463 * @stable ICU 4.0 1464 */ 1465 U_STABLE int32_t U_EXPORT2 1466 uregex_getStackLimit(const URegularExpression *regexp, 1467 UErrorCode *status); 1468 1469 1470 /** 1471 * Function type for a regular expression matching callback function. A pointer to a function of this type is passed to uregex_setMatchCallback(). 1472 * When set, a callback function will be called periodically during matching 1473 * operations. If the callback function returns FALSE, the matching 1474 * operation will be terminated early. 1475 * 1476 * Note: the callback function must not call other functions on this 1477 * URegularExpression. 1478 * 1479 * @param context context pointer. The callback function will be invoked 1480 * with the context specified at the time that 1481 * uregex_setMatchCallback() is called. 1482 * @param steps the accumulated processing time, in match steps, 1483 * for this matching operation. 1484 * @return TRUE to continue the matching operation. 1485 * FALSE to terminate the matching operation. 1486 * @stable ICU 4.0 1487 */ 1488 U_CDECL_BEGIN 1489 typedef UBool U_CALLCONV URegexMatchCallback ( 1490 const void *context, 1491 int32_t steps); 1492 U_CDECL_END 1493 1494 /** 1495 * Set the match callback function for this URegularExpression. 1496 * During matching operations the function will be called periodically, 1497 * giving the application the opportunity to terminate a long-running 1498 * match. 1499 * 1500 * @param regexp The compiled regular expression. 1501 * @param callback A pointer to the user-supplied callback function. 1502 * @param context User context pointer. The value supplied at the 1503 * time the callback function is set will be saved 1504 * and passed to the callback each time that it is called. 1505 * @param status A reference to a UErrorCode to receive any errors.
1506 * @stable ICU 4.0 1507 */ 1508 U_STABLE void U_EXPORT2 1509 uregex_setMatchCallback(URegularExpression *regexp, 1510 URegexMatchCallback *callback, 1511 const void *context, 1512 UErrorCode *status); 1513 1514 1515 /** 1516 * Get the match callback function for this URegularExpression. 1517 * 1518 * @param regexp The compiled regular expression. 1519 * @param callback Out parameter, receives a pointer to the user-supplied 1520 * callback function. 1521 * @param context Out parameter, receives the user context pointer that 1522 * was set when uregex_setMatchCallback() was called. 1523 * @param status A reference to a UErrorCode to receive any errors. 1524 * @stable ICU 4.0 1525 */ 1526 U_STABLE void U_EXPORT2 1527 uregex_getMatchCallback(const URegularExpression *regexp, 1528 URegexMatchCallback **callback, 1529 const void **context, 1530 UErrorCode *status); 1531 1532 1533 /** 1534 * Function type for a regular expression find progress callback function. A pointer to a function of this type is passed to uregex_setFindProgressCallback(). 1535 * 1536 * When set, a callback function will be called during a find operation 1537 * and for operations that depend on find, such as findNext, split and some replace 1538 * operations like replaceFirst. 1539 * The callback will usually be called after each attempt at a match, but this is not a 1540 * guarantee that the callback will be invoked at each character. For finds where the 1541 * match engine is invoked at each character, this may be close to true, but less likely 1542 * for more optimized loops where the pattern is known to only start, and the match 1543 * engine invoked, at certain characters. 1544 * When invoked, this callback will specify the index at which a match operation is about 1545 * to be attempted, giving the application the opportunity to terminate a long-running 1546 * find operation. 1547 * 1548 * If the callback function returns FALSE, the find operation will be terminated early.
1549 * 1550 * Note: the callback function must not call other functions on this 1551 * URegularExpression. 1552 * 1553 * @param context context pointer. The callback function will be invoked 1554 * with the context specified at the time that 1555 * uregex_setFindProgressCallback() is called. 1556 * @param matchIndex the next index at which a match will be attempted for this 1557 * find operation. If this callback interrupts the search, this is the 1558 * index at which a find/findNext operation may be re-initiated. 1559 * @return TRUE to continue the find operation. 1560 * FALSE to terminate the find operation. 1561 * @draft ICU 4.6 1562 */ 1563 U_CDECL_BEGIN 1564 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1565 const void *context, 1566 int64_t matchIndex); 1567 U_CDECL_END 1568 1569 /** 1570 * Set the find progress callback function for this URegularExpression. 1571 * 1572 * @param regexp The compiled regular expression. 1573 * @param callback A pointer to the user-supplied callback function. 1574 * @param context User context pointer. The value supplied at the 1575 * time the callback function is set will be saved 1576 * and passed to the callback each time that it is called. 1577 * @param status A reference to a UErrorCode to receive any errors. 1578 * @draft ICU 4.6 1579 */ 1580 U_DRAFT void U_EXPORT2 1581 uregex_setFindProgressCallback(URegularExpression *regexp, 1582 URegexFindProgressCallback *callback, 1583 const void *context, 1584 UErrorCode *status); 1585 1586 1587 /** 1588 * Get the find progress callback function for this URegularExpression. 1589 * 1590 * @param regexp The compiled regular expression. 1591 * @param callback Out parameter, receives a pointer to the user-supplied 1592 * callback function. 1593 * @param context Out parameter, receives the user context pointer that 1594 * was set when uregex_setFindProgressCallback() was called. 1595 * @param status A reference to a UErrorCode to receive any errors.
1596 * @draft ICU 4.6 1597 */ 1598 U_DRAFT void U_EXPORT2 1599 uregex_getFindProgressCallback(const URegularExpression *regexp, 1600 URegexFindProgressCallback **callback, 1601 const void **context, 1602 UErrorCode *status); 1603 1604 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1605 #endif /* UREGEX_H */ 1606