1 /* 2 ********************************************************************** 3 * Copyright (C) 2004-2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: regex.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2004mar09 11 * created by: Andy Heninger 12 * 13 * ICU Regular Expressions, API for C 14 */ 15 16 /** 17 * \file 18 * \brief C API: Regular Expressions 19 * 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 21 */ 22 23 #ifndef UREGEX_H 24 #define UREGEX_H 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 29 30 #include "unicode/parseerr.h" 31 32 struct URegularExpression; 33 /** 34 * Structure represeting a compiled regular rexpression, plus the results 35 * of a match operation. 36 * @stable ICU 3.0 37 */ 38 typedef struct URegularExpression URegularExpression; 39 40 41 /** 42 * Constants for Regular Expression Match Modes. 43 * @stable ICU 2.4 44 */ 45 typedef enum URegexpFlag{ 46 47 #ifndef U_HIDE_DRAFT_API 48 /** Forces normalization of pattern and strings. 49 Not implemented yet, just a placeholder, hence draft. 50 @draft ICU 2.4 */ 51 UREGEX_CANON_EQ = 128, 52 #endif 53 /** Enable case insensitive matching. @stable ICU 2.4 */ 54 UREGEX_CASE_INSENSITIVE = 2, 55 56 /** Allow white space and comments within patterns @stable ICU 2.4 */ 57 UREGEX_COMMENTS = 4, 58 59 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 60 * @stable ICU 2.4 */ 61 UREGEX_DOTALL = 32, 62 63 /** If set, treat the entire pattern as a literal string. 64 * Metacharacters or escape sequences in the input sequence will be given 65 * no special meaning. 66 * 67 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact 68 * on matching when used in conjunction with this flag. 69 * The other flags become superfluous. 70 * TODO: say which escapes are still handled; anything Java does 71 * early (\u) we should still do. 72 * @draft ICU 4.0 73 */ 74 UREGEX_LITERAL = 16, 75 76 /** Control behavior of "$" and "^" 77 * If set, recognize line terminators within string, 78 * otherwise, match only at start and end of input string. 79 * @stable ICU 2.4 */ 80 UREGEX_MULTILINE = 8, 81 82 /** Unix-only line endings. 83 * When this mode is enabled, only \u000a is recognized as a line ending 84 * in the behavior of ., ^, and $. 85 * @draft ICU 4.0 86 */ 87 UREGEX_UNIX_LINES = 1, 88 89 /** Unicode word boundaries. 90 * If set, \b uses the Unicode TR 29 definition of word boundaries. 91 * Warning: Unicode word boundaries are quite different from 92 * traditional regular expression word boundaries. See 93 * http://unicode.org/reports/tr29/#Word_Boundaries 94 * @stable ICU 2.8 95 */ 96 UREGEX_UWORD = 256, 97 98 /** Error on Unrecognized backslash escapes. 99 * If set, fail with an error on patterns that contain 100 * backslash-escaped ASCII letters without a known specail 101 * meaning. If this flag is not set, these 102 * escaped letters represent themselves. 103 * @draft ICU 4.0 104 */ 105 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 106 107 } URegexpFlag; 108 109 /** 110 * Open (compile) an ICU regular expression. Compiles the regular expression in 111 * string form into an internal representation using the specified match mode flags. 112 * The resulting regular expression handle can then be used to perform various 113 * matching operations. 114 * 115 * @param pattern The Regular Expression pattern to be compiled. 116 * @param patternLength The length of the pattern, or -1 if the pattern is 117 * NUL termintated. 118 * @param flags Flags that alter the default matching behavior for 119 * the regular expression, UREGEX_CASE_INSENSITIVE, for 120 * example. For default behavior, set this parameter to zero. 121 * See <code>enum URegexpFlag</code>. All desired flags 122 * are bitwise-ORed together. 123 * @param pe Receives the position (line and column nubers) of any syntax 124 * error within the source regular expression string. If this 125 * information is not wanted, pass NULL for this parameter. 126 * @param status Receives error detected by this function. 127 * @stable ICU 3.0 128 * 129 */ 130 U_STABLE URegularExpression * U_EXPORT2 131 uregex_open( const UChar *pattern, 132 int32_t patternLength, 133 uint32_t flags, 134 UParseError *pe, 135 UErrorCode *status); 136 137 /** 138 * Open (compile) an ICU regular expression. The resulting regular expression 139 * handle can then be used to perform various matching operations. 140 * <p> 141 * This function is the same as uregex_open, except that the pattern 142 * is supplied as an 8 bit char * string in the default code page. 143 * 144 * @param pattern The Regular Expression pattern to be compiled, 145 * NUL termintated. 146 * @param flags Flags that alter the default matching behavior for 147 * the regular expression, UREGEX_CASE_INSENSITIVE, for 148 * example. For default behavior, set this parameter to zero. 149 * See <code>enum URegexpFlag</code>. All desired flags 150 * are bitwise-ORed together. 151 * @param pe Receives the position (line and column nubers) of any syntax 152 * error within the source regular expression string. If this 153 * information is not wanted, pass NULL for this parameter. 154 * @param status Receives errors detected by this function. 155 * @return The URegularExpression object representing the compiled 156 * pattern. 157 * 158 * @stable ICU 3.0 159 */ 160 #if !UCONFIG_NO_CONVERSION 161 U_STABLE URegularExpression * U_EXPORT2 162 uregex_openC( const char *pattern, 163 uint32_t flags, 164 UParseError *pe, 165 UErrorCode *status); 166 #endif 167 168 169 170 /** 171 * Close the regular expression, recovering all resources (memory) it 172 * was holding. 173 * 174 * @param regexp The regular expression to be closed. 175 * @stable ICU 3.0 176 */ 177 U_STABLE void U_EXPORT2 178 uregex_close(URegularExpression *regexp); 179 180 /** 181 * Make a copy of a compiled regular expression. Cloning a regular 182 * expression is faster than opening a second instance from the source 183 * form of the expression, and requires less memory. 184 * <p> 185 * Note that the current input string and the position of any matched text 186 * within it are not cloned; only the pattern itself and and the 187 * match mode flags are copied. 188 * <p> 189 * Cloning can be particularly useful to threaded applications that perform 190 * multiple match operations in parallel. Each concurrent RE 191 * operation requires its own instance of a URegularExpression. 192 * 193 * @param regexp The compiled regular expression to be cloned. 194 * @param status Receives indication of any errors encountered 195 * @return the cloned copy of the compiled regular expression. 196 * @stable ICU 3.0 197 */ 198 U_STABLE URegularExpression * U_EXPORT2 199 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 200 201 /** 202 * Return a pointer to the source form of the pattern for this regular expression. 203 * 204 * @param regexp The compiled regular expression. 205 * @param patLength This output parameter will be set to the length of the 206 * pattern string. A NULL pointer may be used here if the 207 * pattern length is not needed, as would be the case if 208 * the pattern is known in advance to be a NUL terminated 209 * string. 210 * @param status Receives errors detected by this function. 211 * @return a pointer to the pattern string. The storage for the string is 212 * owned by the regular expression object, and must not be 213 * altered or deleted by the application. The returned string 214 * will remain valid until the regular expression is closed. 215 * @stable ICU 3.0 216 */ 217 U_STABLE const UChar * U_EXPORT2 218 uregex_pattern(const URegularExpression *regexp, 219 int32_t *patLength, 220 UErrorCode *status); 221 222 223 /** 224 * Get the match mode flags that were specified when compiling this regular expression. 225 * @param status Receives errors detected by this function. 226 * @param regexp The compiled regular expression. 227 * @return The match mode flags 228 * @see URegexpFlag 229 * @stable ICU 3.0 230 */ 231 U_STABLE int32_t U_EXPORT2 232 uregex_flags(const URegularExpression *regexp, 233 UErrorCode *status); 234 235 236 /** 237 * Set the subject text string upon which the regular expression will look for matches. 238 * This function may be called any number of times, allowing the regular 239 * expression pattern to be applied to different strings. 240 * <p> 241 * Regular expression matching operations work directly on the application's 242 * string data. No copy is made. The subject string data must not be 243 * altered after calling this function until after all regular expression 244 * operations involving this string data are completed. 245 * <p> 246 * Zero length strings are permitted. In this case, no subsequent match 247 * operation will dereference the text string pointer. 248 * 249 * @param regexp The compiled regular expression. 250 * @param text The subject text string. 251 * @param textLength The length of the subject text, or -1 if the string 252 * is NUL terminated. 253 * @param status Receives errors detected by this function. 254 * @stable ICU 3.0 255 */ 256 U_STABLE void U_EXPORT2 257 uregex_setText(URegularExpression *regexp, 258 const UChar *text, 259 int32_t textLength, 260 UErrorCode *status); 261 262 /** 263 * Get the subject text that is currently associated with this 264 * regular expression object. This simply returns whatever string 265 * pointer was previously supplied via uregex_setText(). 266 * 267 * @param regexp The compiled regular expression. 268 * @param textLength The length of the string is returned in this output parameter. 269 * A NULL pointer may be used here if the 270 * text length is not needed, as would be the case if 271 * the text is known in advance to be a NUL terminated 272 * string. 273 * @param status Receives errors detected by this function. 274 * @return Poiner to the subject text string currently associated with 275 * this regular expression. 276 * @stable ICU 3.0 277 */ 278 U_STABLE const UChar * U_EXPORT2 279 uregex_getText(URegularExpression *regexp, 280 int32_t *textLength, 281 UErrorCode *status); 282 283 /** 284 * Attempts to match the input string against the pattern. 285 * To succeed, the match must extend to the end of the string, 286 * or cover the complete match region. 287 * 288 * If startIndex >= zero the match operation starts at the specified 289 * index and must extend to the end of the input string. Any region 290 * that has been specified is reset. 291 * 292 * If startIndex == -1 the match must cover the input region, or the entire 293 * input string if no region has been set. This directly corresponds to 294 * Matcher.matches() in Java 295 * 296 * @param regexp The compiled regular expression. 297 * @param startIndex The input string index at which to begin matching, or -1 298 * to match the input Region. 299 * @param status Receives errors detected by this function. 300 * @return TRUE if there is a match 301 * @stable ICU 3.0 302 */ 303 U_STABLE UBool U_EXPORT2 304 uregex_matches(URegularExpression *regexp, 305 int32_t startIndex, 306 UErrorCode *status); 307 308 /** 309 * Attempts to match the input string, starting from the specified index, against the pattern. 310 * The match may be of any length, and is not required to extend to the end 311 * of the input string. Contrast with uregex_matches(). 312 * 313 * <p>If startIndex is >= 0 any input region that was set for this 314 * URegularExpression is reset before the operation begins. 315 * 316 * <p>If the specified starting index == -1 the match begins at the start of the input 317 * region, or at the start of the full string if no region has been specified. 318 * This corresponds directly with Matcher.lookingAt() in Java. 319 * 320 * <p>If the match succeeds then more information can be obtained via the 321 * <code>uregexp_start()</code>, <code>uregexp_end()</code>, 322 * and <code>uregexp_group()</code> functions.</p> 323 * 324 * @param regexp The compiled regular expression. 325 * @param startIndex The input string index at which to begin matching, or 326 * -1 to match the Input Region 327 * @param status A reference to a UErrorCode to receive any errors. 328 * @return TRUE if there is a match. 329 * @stable ICU 3.0 330 */ 331 U_STABLE UBool U_EXPORT2 332 uregex_lookingAt(URegularExpression *regexp, 333 int32_t startIndex, 334 UErrorCode *status); 335 336 /** 337 * Find the first matching substring of the input string that matches the pattern. 338 * If startIndex is >= zero the search for a match begins at the specified index, 339 * and any match region is reset. This corresponds directly with 340 * Matcher.find(startIndex) in Java. 341 * 342 * If startIndex == -1 the search begins at the start of the input region, 343 * or at the start of the full string if no region has been specified. 344 * 345 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 346 * <code>uregex_group()</code> will provide more information regarding the match. 347 * 348 * @param regexp The compiled regular expression. 349 * @param startIndex The position in the input string to begin the search, or 350 * -1 to search within the Input Region. 351 * @param status A reference to a UErrorCode to receive any errors. 352 * @return TRUE if a match is found. 353 * @stable ICU 3.0 354 */ 355 U_STABLE UBool U_EXPORT2 356 uregex_find(URegularExpression *regexp, 357 int32_t startIndex, 358 UErrorCode *status); 359 360 /** 361 * Find the next pattern match in the input string. Begin searching 362 * the input at the location following the end of he previous match, 363 * or at the start of the string (or region) if there is no 364 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 365 * <code>uregex_group()</code> will provide more information regarding the match. 366 * 367 * @param regexp The compiled regular expression. 368 * @param status A reference to a UErrorCode to receive any errors. 369 * @return TRUE if a match is found. 370 * @see uregex_reset 371 * @stable ICU 3.0 372 */ 373 U_STABLE UBool U_EXPORT2 374 uregex_findNext(URegularExpression *regexp, 375 UErrorCode *status); 376 377 /** 378 * Get the number of capturing groups in this regular expression's pattern. 379 * @param regexp The compiled regular expression. 380 * @param status A reference to a UErrorCode to receive any errors. 381 * @return the number of capture groups 382 * @stable ICU 3.0 383 */ 384 U_STABLE int32_t U_EXPORT2 385 uregex_groupCount(URegularExpression *regexp, 386 UErrorCode *status); 387 388 /** Extract the string for the specified matching expression or subexpression. 389 * Group #0 is the complete string of matched text. 390 * Group #1 is the text matched by the first set of capturing parentheses. 391 * 392 * @param regexp The compiled regular expression. 393 * @param groupNum The capture group to extract. Group 0 is the complete 394 * match. The value of this parameter must be 395 * less than or equal to the number of capture groups in 396 * the pattern. 397 * @param dest Buffer to receive the matching string data 398 * @param destCapacity Capacity of the dest buffer. 399 * @param status A reference to a UErrorCode to receive any errors. 400 * @return Length of matching data, 401 * or -1 if no applicable match. 402 * @stable ICU 3.0 403 */ 404 U_STABLE int32_t U_EXPORT2 405 uregex_group(URegularExpression *regexp, 406 int32_t groupNum, 407 UChar *dest, 408 int32_t destCapacity, 409 UErrorCode *status); 410 411 412 /** 413 * Returns the index in the input string of the start of the text matched by the 414 * specified capture group during the previous match operation. Return -1 if 415 * the capture group was not part of the last match. 416 * Group #0 refers to the complete range of matched text. 417 * Group #1 refers to the text matched by the first set of capturing parentheses. 418 * 419 * @param regexp The compiled regular expression. 420 * @param groupNum The capture group number 421 * @param status A reference to a UErrorCode to receive any errors. 422 * @return the starting position in the input of the text matched 423 * by the specified group. 424 * @stable ICU 3.0 425 */ 426 U_STABLE int32_t U_EXPORT2 427 uregex_start(URegularExpression *regexp, 428 int32_t groupNum, 429 UErrorCode *status); 430 431 /** 432 * Returns the index in the input string of the position following the end 433 * of the text matched by the specified capture group. 434 * Return -1 if the capture group was not part of the last match. 435 * Group #0 refers to the complete range of matched text. 436 * Group #1 refers to the text matched by the first set of capturing parentheses. 437 * 438 * @param regexp The compiled regular expression. 439 * @param groupNum The capture group number 440 * @param status A reference to a UErrorCode to receive any errors. 441 * @return the index of the position following the last matched character. 442 * @stable ICU 3.0 443 */ 444 U_STABLE int32_t U_EXPORT2 445 uregex_end(URegularExpression *regexp, 446 int32_t groupNum, 447 UErrorCode *status); 448 449 /** 450 * Reset any saved state from the previous match. Has the effect of 451 * causing uregex_findNext to begin at the specified index, and causing 452 * uregex_start(), uregex_end() and uregex_group() to return an error 453 * indicating that there is no match information available. Clears any 454 * match region that may have been set. 455 * 456 * @param regexp The compiled regular expression. 457 * @param index The position in the text at which a 458 * uregex_findNext() should begin searching. 459 * @param status A reference to a UErrorCode to receive any errors. 460 * @stable ICU 3.0 461 */ 462 U_STABLE void U_EXPORT2 463 uregex_reset(URegularExpression *regexp, 464 int32_t index, 465 UErrorCode *status); 466 467 468 /** Sets the limits of the matching region for this URegularExpression. 469 * The region is the part of the input string that will be considered when matching. 470 * Invoking this method resets any saved state from the previous match, 471 * then sets the region to start at the index specified by the start parameter 472 * and end at the index specified by the end parameter. 473 * 474 * Depending on the transparency and anchoring being used (see useTransparentBounds 475 * and useAnchoringBounds), certain constructs such as anchors may behave differently 476 * at or around the boundaries of the region 477 * 478 * The function will fail if start is greater than limit, or if either index 479 * is less than zero or greater than the length of the string being matched. 480 * 481 * @param regexp The compiled regular expression. 482 * @param start The index to begin searches at. 483 * @param limit The index to end searches at (exclusive). 484 * @param status A pointer to a UErrorCode to receive any errors. 485 * @draft ICU 4.0 486 */ 487 U_DRAFT void U_EXPORT2 488 uregex_setRegion(URegularExpression *regexp, 489 int32_t regionStart, 490 int32_t regionLimit, 491 UErrorCode *status); 492 493 /** 494 * Reports the start index of the matching region. Any matches found are limited to 495 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive). 496 * 497 * @param regexp The compiled regular expression. 498 * @param status A pointer to a UErrorCode to receive any errors. 499 * @return The starting index of this matcher's region. 500 * @draft ICU 4.0 501 */ 502 U_DRAFT int32_t U_EXPORT2 503 uregex_regionStart(const URegularExpression *regexp, 504 UErrorCode *status); 505 506 507 508 /** 509 * Reports the end index (exclusive) of the matching region for this URegularExpression. 510 * Any matches found are limited to to the region bounded by regionStart (inclusive) 511 * and regionEnd (exclusive). 512 * 513 * @param regexp The compiled regular expression. 514 * @param status A pointer to a UErrorCode to receive any errors. 515 * @return The ending point of this matcher's region. 516 * @draft ICU 4.0 517 */ 518 U_DRAFT int32_t U_EXPORT2 519 uregex_regionEnd(const URegularExpression *regexp, 520 UErrorCode *status); 521 522 /** 523 * Queries the transparency of region bounds for this URegularExpression. 524 * See useTransparentBounds for a description of transparent and opaque bounds. 525 * By default, matching boundaries are opaque. 526 * 527 * @param regexp The compiled regular expression. 528 * @param status A pointer to a UErrorCode to receive any errors. 529 * @return TRUE if this matcher is using opaque bounds, false if it is not. 530 * @draft ICU 4.0 531 */ 532 U_DRAFT UBool U_EXPORT2 533 uregex_hasTransparentBounds(const URegularExpression *regexp, 534 UErrorCode *status); 535 536 537 /** 538 * Sets the transparency of region bounds for this URegularExpression. 539 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. 540 * If the boolean argument is FALSE, then opaque bounds will be used. 541 * 542 * Using transparent bounds, the boundaries of the matching region are transparent 543 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 544 * see text beyond the boundaries of the region while checking for a match. 545 * 546 * With opaque bounds, no text outside of the matching region is visible to lookahead, 547 * lookbehind, and boundary matching constructs. 548 * 549 * By default, opaque bounds are used. 550 * 551 * @param regexp The compiled regular expression. 552 * @param b TRUE for transparent bounds; FALSE for opaque bounds 553 * @param status A pointer to a UErrorCode to receive any errors. 554 * @draft ICU 4.0 555 **/ 556 U_DRAFT void U_EXPORT2 557 uregex_useTransparentBounds(URegularExpression *regexp, 558 UBool b, 559 UErrorCode *status); 560 561 562 /** 563 * Return true if this URegularExpression is using anchoring bounds. 564 * By default, anchoring region bounds are used. 565 * 566 * @param regexp The compiled regular expression. 567 * @param status A pointer to a UErrorCode to receive any errors. 568 * @return TRUE if this matcher is using anchoring bounds. 569 * @draft ICU 4.0 570 */ 571 U_DRAFT UBool U_EXPORT2 572 uregex_hasAnchoringBounds(const URegularExpression *regexp, 573 UErrorCode *status); 574 575 576 /** 577 * Set whether this URegularExpression is using Anchoring Bounds for its region. 578 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 579 * and end of the region. Without Anchoring Bounds, anchors will only match at 580 * the positions they would in the complete text. 581 * 582 * Anchoring Bounds are the default for regions. 583 * 584 * @param regexp The compiled regular expression. 585 * @param b TRUE if to enable anchoring bounds; FALSE to disable them. 586 * @param status A pointer to a UErrorCode to receive any errors. 587 * @draft ICU 4.0 588 */ 589 U_DRAFT void U_EXPORT2 590 uregex_useAnchoringBounds(URegularExpression *regexp, 591 UBool b, 592 UErrorCode *status); 593 594 /** 595 * Return TRUE if the most recent matching operation touched the 596 * end of the text being processed. In this case, additional input text could 597 * change the results of that match. 598 * 599 * @param regexp The compiled regular expression. 600 * @param status A pointer to a UErrorCode to receive any errors. 601 * @return TRUE if the most recent match hit the end of input 602 * @draft ICU 4.0 603 */ 604 U_DRAFT UBool U_EXPORT2 605 uregex_hitEnd(const URegularExpression *regexp, 606 UErrorCode *status); 607 608 /** 609 * Return TRUE the most recent match succeeded and additional input could cause 610 * it to fail. If this function returns false and a match was found, then more input 611 * might change the match but the match won't be lost. If a match was not found, 612 * then requireEnd has no meaning. 613 * 614 * @param regexp The compiled regular expression. 615 * @param status A pointer to a UErrorCode to receive any errors. 616 * @return TRUE if more input could cause the most recent match to no longer match. 617 * @draft ICU 4.0 618 */ 619 U_DRAFT UBool U_EXPORT2 620 uregex_requireEnd(const URegularExpression *regexp, 621 UErrorCode *status); 622 623 624 625 626 627 /** 628 * Replaces every substring of the input that matches the pattern 629 * with the given replacement string. This is a convenience function that 630 * provides a complete find-and-replace-all operation. 631 * 632 * This method scans the input string looking for matches of the pattern. 633 * Input that is not part of any match is copied unchanged to the 634 * destination buffer. Matched regions are replaced in the output 635 * buffer by the replacement string. The replacement string may contain 636 * references to capture groups; these take the form of $1, $2, etc. 637 * 638 * @param regexp The compiled regular expression. 639 * @param replacementText A string containing the replacement text. 640 * @param replacementLength The length of the replacement string, or 641 * -1 if it is NUL terminated. 642 * @param destBuf A (UChar *) buffer that will receive the result. 643 * @param destCapacity The capacity of the desitnation buffer. 644 * @param status A reference to a UErrorCode to receive any errors. 645 * @return The length of the string resulting from the find 646 * and replace operation. In the event that the 647 * destination capacity is inadequate, the return value 648 * is still the full length of the untruncated string. 649 * @stable ICU 3.0 650 */ 651 U_STABLE int32_t U_EXPORT2 652 uregex_replaceAll(URegularExpression *regexp, 653 const UChar *replacementText, 654 int32_t replacementLength, 655 UChar *destBuf, 656 int32_t destCapacity, 657 UErrorCode *status); 658 659 660 /** 661 * Replaces the first substring of the input that matches the pattern 662 * with the given replacement string. This is a convenience function that 663 * provides a complete find-and-replace operation. 664 * 665 * This method scans the input string looking for a match of the pattern. 666 * All input that is not part of the match is copied unchanged to the 667 * destination buffer. The matched region is replaced in the output 668 * buffer by the replacement string. The replacement string may contain 669 * references to capture groups; these take the form of $1, $2, etc. 670 * 671 * @param regexp The compiled regular expression. 672 * @param replacementText A string containing the replacement text. 673 * @param replacementLength The length of the replacement string, or 674 * -1 if it is NUL terminated. 675 * @param destBuf A (UChar *) buffer that will receive the result. 676 * @param destCapacity The capacity of the desitnation buffer. 677 * @param status a reference to a UErrorCode to receive any errors. 678 * @return The length of the string resulting from the find 679 * and replace operation. In the event that the 680 * destination capacity is inadequate, the return value 681 * is still the full length of the untruncated string. 682 * @stable ICU 3.0 683 */ 684 U_STABLE int32_t U_EXPORT2 685 uregex_replaceFirst(URegularExpression *regexp, 686 const UChar *replacementText, 687 int32_t replacementLength, 688 UChar *destBuf, 689 int32_t destCapacity, 690 UErrorCode *status); 691 692 693 /** 694 * Implements a replace operation intended to be used as part of an 695 * incremental find-and-replace. 696 * 697 * <p>The input string, starting from the end of the previous match and ending at 698 * the start of the current match, is appended to the destination string. Then the 699 * replacement string is appended to the output string, 700 * including handling any substitutions of captured text.</p> 701 * 702 * <p>A note on preflight computation of buffersize and error handling: 703 * Calls to uregex_appendReplacement() and uregex_appendTail() are 704 * designed to be chained, one after another, with the destination 705 * buffer pointer and buffer capacity updated after each in preparation 706 * to for the next. If the destination buffer is exhausted partway through such a 707 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal 708 * ICU conventions are for a function to perform no action if it is 709 * called with an error status, but for this one case, uregex_appendRepacement() 710 * will operate normally so that buffer size computations will complete 711 * correctly. 712 * 713 * <p>For simple, prepackaged, non-incremental find-and-replace 714 * operations, see replaceFirst() or replaceAll().</p> 715 * 716 * @param regexp The regular expression object. 717 * @param replacementText The string that will replace the matched portion of the 718 * input string as it is copied to the destination buffer. 719 * The replacement text may contain references ($1, for 720 * example) to capture groups from the match. 721 * @param replacementLength The length of the replacement text string, 722 * or -1 if the string is NUL terminated. 723 * @param destBuf The buffer into which the results of the 724 * find-and-replace are placed. On return, this pointer 725 * will be updated to refer to the beginning of the 726 * unused portion of buffer, leaving it in position for 727 * a subsequent call to this function. 728 * @param destCapacity The size of the output buffer, On return, this 729 * parameter will be updated to reflect the space remaining 730 * unused in the output buffer. 731 * @param status A reference to a UErrorCode to receive any errors. 732 * @return The length of the result string. In the event that 733 * destCapacity is inadequate, the full length of the 734 * untruncated output string is returned. 735 * 736 * @stable ICU 3.0 737 * 738 */ 739 U_STABLE int32_t U_EXPORT2 740 uregex_appendReplacement(URegularExpression *regexp, 741 const UChar *replacementText, 742 int32_t replacementLength, 743 UChar **destBuf, 744 int32_t *destCapacity, 745 UErrorCode *status); 746 747 748 /** 749 * As the final step in a find-and-replace operation, append the remainder 750 * of the input string, starting at the position following the last match, 751 * to the destination string. <code>uregex_appendTail()</code> is intended 752 * to be invoked after one or more invocations of the 753 * <code>uregex_appendReplacement()</code> function. 754 * 755 * @param regexp The regular expression object. This is needed to 756 * obtain the input string and with the position 757 * of the last match within it. 758 * @param destBuf The buffer in which the results of the 759 * find-and-replace are placed. On return, the pointer 760 * will be updated to refer to the beginning of the 761 * unused portion of buffer. 762 * @param destCapacity The size of the output buffer, On return, this 763 * value will be updated to reflect the space remaining 764 * unused in the output buffer. 765 * @param status A reference to a UErrorCode to receive any errors. 766 * @return The length of the result string. In the event that 767 * destCapacity is inadequate, the full length of the 768 * untruncated output string is returned. 769 * 770 * @stable ICU 3.0 771 */ 772 U_STABLE int32_t U_EXPORT2 773 uregex_appendTail(URegularExpression *regexp, 774 UChar **destBuf, 775 int32_t *destCapacity, 776 UErrorCode *status); 777 778 779 780 781 /** 782 * Split a string into fields. Somewhat like split() from Perl. 783 * The pattern matches identify delimiters that separate the input 784 * into fields. The input data between the matches becomes the 785 * fields themselves. 786 * <p> 787 * Each of the fields is copied from the input string to the destination 788 * buffer, and the NUL terminated. The position of each field within 789 * the destination buffer is returned in the destFields array. 790 * 791 * Note: another choice for the design of this function would be to not 792 * copy the resulting fields at all, but to return indexes and 793 * lengths within the source text. 794 * Advantages would be 795 * o Faster. No Copying. 796 * o Nothing extra needed when field data may contain embedded NUL chars. 797 * o Less memory needed if working on large data. 798 * Disadvantages 799 * o Less consistent with C++ split, which copies into an 800 * array of UnicodeStrings. 801 * o No NUL termination, extracted fields would be less convenient 802 * to use in most cases. 803 * o Possible problems in the future, when support Unicode Normalization 804 * could cause the fields to not correspond exactly to 805 * a range of the source text. 806 * 807 * @param regexp The compiled regular expression. 808 * @param destBuf A (UChar *) buffer to receive the fields that 809 * are extracted from the input string. These 810 * field pointers will refer to positions within the 811 * destination buffer supplied by the caller. Any 812 * extra positions within the destFields array will be 813 * set to NULL. 814 * @param destCapacity The capacity of the destBuf. 815 * @param requiredCapacity The actual capacity required of the destBuf. 816 * If destCapacity is too small, requiredCapacity will return 817 * the total capacity required to hold all of the output, and 818 * a U_BUFFER_OVERFLOW_ERROR will be returned. 819 * @param destFields An array to be filled with the position of each 820 * of the extracted fields within destBuf. 821 * @param destFieldsCapacity The number of elements in the destFields array. 822 * If the number of fields found is less than destFieldsCapacity, 823 * the extra destFields elements are set to zero. 824 * If destFieldsCapacity is too small, the trailing part of the 825 * input, including any field delimiters, is treated as if it 826 * were the last field - it is copied to the destBuf, and 827 * its position is in the destBuf is stored in the last element 828 * of destFields. This behavior mimics that of Perl. It is not 829 * an error condition, and no error status is returned when all destField 830 * positions are used. 831 * @param status A reference to a UErrorCode to receive any errors. 832 * @return The number of fields into which the input string was split. 833 * @stable ICU 3.0 834 */ 835 U_STABLE int32_t U_EXPORT2 836 uregex_split( URegularExpression *regexp, 837 UChar *destBuf, 838 int32_t destCapacity, 839 int32_t *requiredCapacity, 840 UChar *destFields[], 841 int32_t destFieldsCapacity, 842 UErrorCode *status); 843 844 845 846 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 847 #endif /* UREGEX_H */ 848