• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2004-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  uregex.h
9 *   encoding:   UTF-8
10 *   indentation:4
11 *
12 *   created on: 2004mar09
13 *   created by: Andy Heninger
14 *
15 *   ICU Regular Expressions, API for C
16 */
17 
18 /**
19  * \file
20  * \brief C API: Regular Expressions
21  *
22  * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p>
23  */
24 
25 #ifndef UREGEX_H
26 #define UREGEX_H
27 
28 #include "unicode/utext.h"
29 #include "unicode/utypes.h"
30 
31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 
33 #include "unicode/parseerr.h"
34 
35 #if U_SHOW_CPLUSPLUS_API
36 #include "unicode/localpointer.h"
37 #endif   // U_SHOW_CPLUSPLUS_API
38 
39 struct URegularExpression;
40 /**
41   * Structure representing a compiled regular expression, plus the results
42   *    of a match operation.
43   * @stable ICU 3.0
44   */
45 typedef struct URegularExpression URegularExpression;
46 
47 
48 /**
49  * Constants for Regular Expression Match Modes.  Each constant is a distinct single-bit value, so several flags can be combined with bitwise OR.
50  * @stable ICU 2.4
51  */
52 typedef enum URegexpFlag{
53 
54 #ifndef U_HIDE_DRAFT_API
55     /** Forces normalization of pattern and strings.
56     Not implemented yet, just a placeholder, hence draft.
57     @draft ICU 2.4 */
58     UREGEX_CANON_EQ         = 128,
59 #endif /* U_HIDE_DRAFT_API */
60     /**  Enable case-insensitive matching.  @stable ICU 2.4 */
61     UREGEX_CASE_INSENSITIVE = 2,
62 
63     /**  Allow white space and comments within patterns.  @stable ICU 2.4 */
64     UREGEX_COMMENTS         = 4,
65 
66     /**  If set, '.' matches line terminators; otherwise '.' matching stops at line end.
67       *  @stable ICU 2.4 */
68     UREGEX_DOTALL           = 32,
69 
70     /**  If set, treat the entire pattern as a literal string.
71       *  Metacharacters or escape sequences in the input sequence will be given
72       *  no special meaning.
73       *
74       *  The flag UREGEX_CASE_INSENSITIVE retains its impact
75       *  on matching when used in conjunction with this flag.
76       *  The other flags become superfluous.
77       *
78       * @stable ICU 4.0
79       */
80     UREGEX_LITERAL = 16,
81 
82     /**   Control behavior of "$" and "^".
83       *    If set, recognize line terminators within string,
84       *    otherwise, match only at start and end of input string.
85       *   @stable ICU 2.4 */
86     UREGEX_MULTILINE        = 8,
87 
88     /**   Unix-only line endings.
89       *   When this mode is enabled, only \\u000a is recognized as a line ending
90       *    in the behavior of ., ^, and $.
91       *   @stable ICU 4.0
92       */
93     UREGEX_UNIX_LINES = 1,
94 
95     /**  Unicode word boundaries.
96       *     If set, \b uses the Unicode TR 29 definition of word boundaries.
97       *     Warning: Unicode word boundaries are quite different from
98       *     traditional regular expression word boundaries.  See
99       *     http://unicode.org/reports/tr29/#Word_Boundaries
100       *     @stable ICU 2.8
101       */
102     UREGEX_UWORD            = 256,
103 
104      /**  Error on unrecognized backslash escapes.
105        *     If set, fail with an error on patterns that contain
106        *     backslash-escaped ASCII letters without a known special
107        *     meaning.  If this flag is not set, these
108        *     escaped letters represent themselves.
109        *     @stable ICU 4.0
110        */
111      UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
112 
113 }  URegexpFlag;
114 
115 /**
116   *  Open (compile) an ICU regular expression.  Compiles the regular expression in
117   *  string form into an internal representation using the specified match mode flags.
118   *  The resulting regular expression handle can then be used to perform various
119   *   matching operations.
120   *
121   *
122   * @param pattern        The Regular Expression pattern to be compiled.
123   * @param patternLength  The length of the pattern, or -1 if the pattern is
124   *                       NUL terminated.
125   * @param flags          Flags that alter the default matching behavior for
126   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
127   *                       example.  For default behavior, set this parameter to zero.
128   *                       See <code>enum URegexpFlag</code>.  All desired flags
129   *                       are bitwise-ORed together.
130   * @param pe             Receives the position (line and column numbers) of any syntax
131   *                       error within the source regular expression string.  If this
132   *                       information is not wanted, pass NULL for this parameter.
133   * @param status         Receives errors detected by this function.
134   * @return               The URegularExpression object representing the compiled pattern.
135   * @stable ICU 3.0
136   */
137 U_CAPI URegularExpression * U_EXPORT2
138 uregex_open( const  UChar          *pattern,
139                     int32_t         patternLength,
140                     uint32_t        flags,
141                     UParseError    *pe,
142                     UErrorCode     *status);
143 
144 /**
145   *  Open (compile) an ICU regular expression.  Compiles the regular expression in
146   *  string form into an internal representation using the specified match mode flags.
147   *  The resulting regular expression handle can then be used to perform various
148   *   matching operations.
149   *  <p>
150   *  The contents of the pattern UText will be extracted and saved. Ownership of the
151   *   UText struct itself remains with the caller. This is to match the behavior of
152   *   uregex_open().
153   *
154   * @param pattern        The Regular Expression pattern to be compiled.
155   * @param flags          Flags that alter the default matching behavior for
156   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
157   *                       example.  For default behavior, set this parameter to zero.
158   *                       See <code>enum URegexpFlag</code>.  All desired flags
159   *                       are bitwise-ORed together.
160   * @param pe             Receives the position (line and column numbers) of any syntax
161   *                       error within the source regular expression string.  If this
162   *                       information is not wanted, pass NULL for this parameter.
163   * @param status         Receives errors detected by this function.
164   * @return               The URegularExpression object representing the compiled pattern.
165   * @stable ICU 4.6
166   */
167 U_CAPI URegularExpression *  U_EXPORT2
168 uregex_openUText(UText          *pattern,
169                  uint32_t        flags,
170                  UParseError    *pe,
171                  UErrorCode     *status);
172 
173 #if !UCONFIG_NO_CONVERSION
174 /**
175   *  Open (compile) an ICU regular expression.  The resulting regular expression
176   *   handle can then be used to perform various matching operations.
177   *  <p>
178   *   This function is the same as uregex_open, except that the pattern
179   *   is supplied as an 8 bit char * string in the default code page.
180   *
181   * @param pattern        The Regular Expression pattern to be compiled,
182   *                       NUL terminated.
183   * @param flags          Flags that alter the default matching behavior for
184   *                       the regular expression, UREGEX_CASE_INSENSITIVE, for
185   *                       example.  For default behavior, set this parameter to zero.
186   *                       See <code>enum URegexpFlag</code>.  All desired flags
187   *                       are bitwise-ORed together.
188   * @param pe             Receives the position (line and column numbers) of any syntax
189   *                       error within the source regular expression string.  If this
190   *                       information is not wanted, pass NULL for this parameter.
191   * @param status         Receives errors detected by this function.
192   * @return               The URegularExpression object representing the compiled
193   *                       pattern.
194   *
195   * @stable ICU 3.0
196   */
197 U_CAPI URegularExpression * U_EXPORT2
198 uregex_openC( const char           *pattern,
199                     uint32_t        flags,
200                     UParseError    *pe,
201                     UErrorCode     *status);
202 #endif
203 
204 
205 
206 /**
207   *  Close the regular expression, recovering all resources (memory) it
208   *   was holding.
209   *
210   * @param regexp   The regular expression to be closed.
211   * @stable ICU 3.0
212   */
213 U_CAPI void U_EXPORT2
214 uregex_close(URegularExpression *regexp);
215 
216 #if U_SHOW_CPLUSPLUS_API
217 
218 U_NAMESPACE_BEGIN
219 
220 /**
221  * \class LocalURegularExpressionPointer
222  * "Smart pointer" class, closes a URegularExpression via uregex_close().
223  * For most methods see the LocalPointerBase base class.
224  *
225  * @see LocalPointerBase
226  * @see LocalPointer
227  * @stable ICU 4.4
228  */
229 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close);
230 
231 U_NAMESPACE_END
232 
233 #endif
234 
235 /**
236  * Make a copy of a compiled regular expression.  Cloning a regular
237  * expression is faster than opening a second instance from the source
238  * form of the expression, and requires less memory.
239  * <p>
240  * Note that the current input string and the position of any matched text
241  *  within it are not cloned; only the pattern itself and the
242  *  match mode flags are copied.
243  * <p>
244  * Cloning can be particularly useful to threaded applications that perform
245  * multiple match operations in parallel.  Each concurrent RE
246  * operation requires its own instance of a URegularExpression.
247  *
248  * @param regexp   The compiled regular expression to be cloned.
249  * @param status   Receives indication of any errors encountered
250  * @return the cloned copy of the compiled regular expression.
251  * @stable ICU 3.0
252  */
253 U_CAPI URegularExpression * U_EXPORT2
254 uregex_clone(const URegularExpression *regexp, UErrorCode *status);
255 
256 /**
257  *  Returns a pointer to the source form of the pattern for this regular expression.
258  *  This function will work even if the pattern was originally specified as a UText.
259  *
260  * @param regexp     The compiled regular expression.
261  * @param patLength  This output parameter will be set to the length of the
262  *                   pattern string.  A NULL pointer may be used here if the
263  *                   pattern length is not needed, as would be the case if
264  *                   the pattern is known in advance to be a NUL terminated
265  *                   string.
266  * @param status     Receives errors detected by this function.
267  * @return a pointer to the pattern string.  The storage for the string is
268  *                   owned by the regular expression object, and must not be
269  *                   altered or deleted by the application.  The returned string
270  *                   will remain valid until the regular expression is closed.
271  * @stable ICU 3.0
272  */
273 U_CAPI const UChar * U_EXPORT2
274 uregex_pattern(const URegularExpression *regexp,
275                      int32_t            *patLength,
276                      UErrorCode         *status);
277 
278 /**
279  *  Returns the source text of the pattern for this regular expression.
280  *  This function will work even if the pattern was originally specified as a UChar string.
281  *
282  * @param regexp     The compiled regular expression.
283  * @param status     Receives errors detected by this function.
284  * @return the pattern text.  The storage for the text is owned by the regular expression
285  *                   object, and must not be altered or deleted.
286  *
287  * @stable ICU 4.6
288  */
289 U_CAPI UText * U_EXPORT2
290 uregex_patternUText(const URegularExpression *regexp,
291                           UErrorCode         *status);
292 
293 /**
294   * Get the match mode flags that were specified when compiling this regular expression.
295   * @param regexp   The compiled regular expression.
296   * @param status   Receives errors detected by this function.
297   * @return         The match mode flags
298   * @see URegexpFlag
299   * @stable ICU 3.0
300   */
301 U_CAPI int32_t U_EXPORT2
302 uregex_flags(const  URegularExpression   *regexp,
303                     UErrorCode           *status);
304 
305 
306 /**
307   *  Set the subject text string upon which the regular expression will look for matches.
308   *  This function may be called any number of times, allowing the regular
309   *  expression pattern to be applied to different strings.
310   *  <p>
311   *  Regular expression matching operations work directly on the application's
312   *  string data.  No copy is made.  The subject string data must not be
313   *  altered after calling this function until after all regular expression
314   *  operations involving this string data are completed.
315   *  <p>
316   *  Zero length strings are permitted.  In this case, no subsequent match
317   *  operation will dereference the text string pointer.
318   *
319   * @param regexp     The compiled regular expression.
320   * @param text       The subject text string.
321   * @param textLength The length of the subject text, or -1 if the string
322   *                   is NUL terminated.
323   * @param status     Receives errors detected by this function.
324   * @stable ICU 3.0
325   */
326 U_CAPI void U_EXPORT2
327 uregex_setText(URegularExpression *regexp,
328                const UChar        *text,
329                int32_t             textLength,
330                UErrorCode         *status);
331 
332 
333 /**
334   *  Set the subject text string upon which the regular expression will look for matches.
335   *  This function may be called any number of times, allowing the regular
336   *  expression pattern to be applied to different strings.
337   *  <p>
338   *  Regular expression matching operations work directly on the application's
339   *  string data; only a shallow clone is made.  The subject string data must not be
340   *  altered after calling this function until after all regular expression
341   *  operations involving this string data are completed.
342   *
343   * @param regexp     The compiled regular expression.
344   * @param text       The subject text string.
345   * @param status     Receives errors detected by this function.
346   *
347   * @stable ICU 4.6
348   */
349 U_CAPI void U_EXPORT2
350 uregex_setUText(URegularExpression *regexp,
351                 UText              *text,
352                 UErrorCode         *status);
353 
354 /**
355   *  Get the subject text that is currently associated with this
356   *   regular expression object.  If the input was supplied using uregex_setText(),
357   *   that pointer will be returned.  Otherwise, the characters in the input will
358   *   be extracted to a buffer and returned.  In either case, ownership remains
359   *   with the regular expression object.
360   *
361   *  This function will work even if the input was originally specified as a UText.
362   *
363   * @param regexp      The compiled regular expression.
364   * @param textLength  The length of the string is returned in this output parameter.
365   *                    A NULL pointer may be used here if the
366   *                    text length is not needed, as would be the case if
367   *                    the text is known in advance to be a NUL terminated
368   *                    string.
369   * @param status      Receives errors detected by this function.
370   * @return            Pointer to the subject text string currently associated with
371   *                    this regular expression.
372   * @stable ICU 3.0
373   */
374 U_CAPI const UChar * U_EXPORT2
375 uregex_getText(URegularExpression *regexp,
376                int32_t            *textLength,
377                UErrorCode         *status);
378 
379 /**
380   *  Get the subject text that is currently associated with this
381   *   regular expression object.
382   *
383   *  This function will work even if the input was originally specified as a UChar string.
384   *
385   * @param regexp      The compiled regular expression.
386   * @param dest        A mutable UText in which to store the current input.
387   *                    If NULL, a new UText will be created as an immutable shallow clone
388   *                    of the actual input string.
389   * @param status      Receives errors detected by this function.
390   * @return            The subject text currently associated with this regular expression.
391   *                    If a pre-allocated UText was provided, it will always be used and returned.
392   *
393   * @stable ICU 4.6
394   */
395 U_CAPI UText * U_EXPORT2
396 uregex_getUText(URegularExpression *regexp,
397                 UText              *dest,
398                 UErrorCode         *status);
399 
400 /**
401   *  Set the subject text string upon which the regular expression is looking for matches
402   *  without changing any other aspect of the matching state.
403   *  The new and previous text strings must have the same content.
404   *
405   *  This function is intended for use in environments where ICU is operating on
406   *  strings that may move around in memory.  It provides a mechanism for notifying
407   *  ICU that the string has been relocated, and providing a new UText to access the
408   *  string in its new position.
409   *
410   *  Note that the regular expression implementation never copies the underlying text
411   *  of a string being matched, but always operates directly on the original text
412   *  provided by the user. Refreshing simply drops the references to the old text
413   *  and replaces them with references to the new.
414   *
415   *  Caution:  this function is normally used only by very specialized
416   *            system-level code.   One example use case is with garbage collection
417   *            that moves the text in memory.
418   *
419   * @param regexp     The compiled regular expression.
420   * @param text       The new (moved) text string.
421   * @param status     Receives errors detected by this function.
422   *
423   * @stable ICU 4.8
424   */
425 U_CAPI void U_EXPORT2
426 uregex_refreshUText(URegularExpression *regexp,
427                     UText              *text,
428                     UErrorCode         *status);
429 
430 /**
431   *   Attempts to match the input string against the pattern.
432   *   To succeed, the match must extend to the end of the string,
433   *   or cover the complete match region.
434   *
435   *   If startIndex >= zero the match operation starts at the specified
436   *   index and must extend to the end of the input string.  Any region
437   *   that has been specified is reset.
438   *
439   *   If startIndex == -1 the match must cover the input region, or the entire
440   *   input string if no region has been set.  This directly corresponds to
441   *   Matcher.matches() in Java.
442   *
443   *    @param  regexp      The compiled regular expression.
444   *    @param  startIndex  The input string (native) index at which to begin matching, or -1
445   *                        to match the input Region.
446   *    @param  status      Receives errors detected by this function.
447   *    @return             true if there is a match
448   *    @stable ICU 3.0
449   */
450 U_CAPI UBool U_EXPORT2
451 uregex_matches(URegularExpression *regexp,
452                 int32_t            startIndex,
453                 UErrorCode        *status);
454 
455 /**
456   *   64bit version of uregex_matches.
457   *   Attempts to match the input string against the pattern.
458   *   To succeed, the match must extend to the end of the string,
459   *   or cover the complete match region.
460   *
461   *   If startIndex >= zero the match operation starts at the specified
462   *   index and must extend to the end of the input string.  Any region
463   *   that has been specified is reset.
464   *
465   *   If startIndex == -1 the match must cover the input region, or the entire
466   *   input string if no region has been set.  This directly corresponds to
467   *   Matcher.matches() in Java.
468   *
469   *    @param  regexp      The compiled regular expression.
470   *    @param  startIndex  The input string (native) index at which to begin matching, or -1
471   *                        to match the input Region.
472   *    @param  status      Receives errors detected by this function.
473   *    @return             true if there is a match
474   *   @stable ICU 4.6
475   */
476 U_CAPI UBool U_EXPORT2
477 uregex_matches64(URegularExpression *regexp,
478                  int64_t            startIndex,
479                  UErrorCode        *status);
480 
481 /**
482   *   Attempts to match the input string, starting from the specified index, against the pattern.
483   *   The match may be of any length, and is not required to extend to the end
484   *   of the input string.  Contrast with uregex_matches().
485   *
486   *   <p>If startIndex is >= 0 any input region that was set for this
487   *   URegularExpression is reset before the operation begins.
488   *
489   *   <p>If the specified starting index == -1 the match begins at the start of the input
490   *   region, or at the start of the full string if no region has been specified.
491   *   This corresponds directly with Matcher.lookingAt() in Java.
492   *
493   *   <p>If the match succeeds then more information can be obtained via the
494   *    <code>uregex_start()</code>, <code>uregex_end()</code>,
495   *    and <code>uregex_group()</code> functions.</p>
496   *
497   *    @param   regexp      The compiled regular expression.
498   *    @param   startIndex  The input string (native) index at which to begin matching, or
499   *                         -1 to match the Input Region
500   *    @param   status      A reference to a UErrorCode to receive any errors.
501   *    @return  true if there is a match.
502   *    @stable ICU 3.0
503   */
504 U_CAPI UBool U_EXPORT2
505 uregex_lookingAt(URegularExpression *regexp,
506                  int32_t             startIndex,
507                  UErrorCode         *status);
508 
509 /**
510   *   64bit version of uregex_lookingAt.
511   *   Attempts to match the input string, starting from the specified index, against the pattern.
512   *   The match may be of any length, and is not required to extend to the end
513   *   of the input string.  Contrast with uregex_matches().
514   *
515   *   <p>If startIndex is >= 0 any input region that was set for this
516   *   URegularExpression is reset before the operation begins.
517   *
518   *   <p>If the specified starting index == -1 the match begins at the start of the input
519   *   region, or at the start of the full string if no region has been specified.
520   *   This corresponds directly with Matcher.lookingAt() in Java.
521   *
522   *   <p>If the match succeeds then more information can be obtained via the
523   *    <code>uregex_start()</code>, <code>uregex_end()</code>,
524   *    and <code>uregex_group()</code> functions.</p>
525   *
526   *    @param   regexp      The compiled regular expression.
527   *    @param   startIndex  The input string (native) index at which to begin matching, or
528   *                         -1 to match the Input Region
529   *    @param   status      A reference to a UErrorCode to receive any errors.
530   *    @return  true if there is a match.
531   *    @stable ICU 4.6
532   */
533 U_CAPI UBool U_EXPORT2
534 uregex_lookingAt64(URegularExpression *regexp,
535                    int64_t             startIndex,
536                    UErrorCode         *status);
537 
538 /**
539   *   Find the first matching substring of the input string that matches the pattern.
540   *   If startIndex is >= zero the search for a match begins at the specified index,
541   *          and any match region is reset.  This corresponds directly with
542   *          Matcher.find(startIndex) in Java.
543   *
544   *   If startIndex == -1 the search begins at the start of the input region,
545   *           or at the start of the full string if no region has been specified.
546   *
547   *   If a match is found, <code>uregex_start(), uregex_end()</code>, and
548   *   <code>uregex_group()</code> will provide more information regarding the match.
549   *
550   *   @param   regexp      The compiled regular expression.
551   *   @param   startIndex  The position (native) in the input string to begin the search, or
552   *                        -1 to search within the Input Region.
553   *   @param   status      A reference to a UErrorCode to receive any errors.
554   *   @return              true if a match is found.
555   *   @stable ICU 3.0
556   */
557 U_CAPI UBool U_EXPORT2
558 uregex_find(URegularExpression *regexp,
559             int32_t             startIndex,
560             UErrorCode         *status);
561 
562 /**
563   *   64bit version of uregex_find.
564   *   Find the first matching substring of the input string that matches the pattern.
565   *   If startIndex is >= zero the search for a match begins at the specified index,
566   *          and any match region is reset.  This corresponds directly with
567   *          Matcher.find(startIndex) in Java.
568   *
569   *   If startIndex == -1 the search begins at the start of the input region,
570   *           or at the start of the full string if no region has been specified.
571   *
572   *   If a match is found, <code>uregex_start(), uregex_end()</code>, and
573   *   <code>uregex_group()</code> will provide more information regarding the match.
574   *
575   *   @param   regexp      The compiled regular expression.
576   *   @param   startIndex  The position (native) in the input string to begin the search, or
577   *                        -1 to search within the Input Region.
578   *   @param   status      A reference to a UErrorCode to receive any errors.
579   *   @return              true if a match is found.
580   *   @stable ICU 4.6
581   */
582 U_CAPI UBool U_EXPORT2
583 uregex_find64(URegularExpression *regexp,
584               int64_t             startIndex,
585               UErrorCode         *status);
586 
587 /**
588   *  Find the next pattern match in the input string.  Begin searching
589   *  the input at the location following the end of the previous match,
590   *  or at the start of the string (or region) if there is no
591   *  previous match.  If a match is found, <code>uregex_start(), uregex_end()</code>, and
592   *  <code>uregex_group()</code> will provide more information regarding the match.
593   *
594   *  @param   regexp      The compiled regular expression.
595   *  @param   status      A reference to a UErrorCode to receive any errors.
596   *  @return              true if a match is found.
597   *  @see uregex_reset
598   *  @stable ICU 3.0
599   */
600 U_CAPI UBool U_EXPORT2
601 uregex_findNext(URegularExpression *regexp,
602                 UErrorCode         *status);
603 
604 /**
605   *   Get the number of capturing groups in this regular expression's pattern.
606   *   @param   regexp      The compiled regular expression.
607   *   @param   status      A reference to a UErrorCode to receive any errors.
608   *   @return the number of capture groups
609   *   @stable ICU 3.0
610   */
611 U_CAPI int32_t U_EXPORT2
612 uregex_groupCount(URegularExpression *regexp,
613                   UErrorCode         *status);
614 
615 /**
616   * Get the group number corresponding to a named capture group.
617   * The returned number can be used with any function that accesses
618   * capture groups by number.
619   *
620   * The function returns an error status if the specified name does not
621   * appear in the pattern.
622   *
623   * @param  regexp      The compiled regular expression.
624   * @param  groupName   The capture group name.
625   * @param  nameLength  The length of the name, or -1 if the name is a
626   *                     nul-terminated string.
627   * @param  status      A pointer to a UErrorCode to receive any errors.
628   * @return             The group number corresponding to the named capture group.
629   * @stable ICU 55
630   */
631 U_CAPI int32_t U_EXPORT2
632 uregex_groupNumberFromName(URegularExpression *regexp,
633                            const UChar        *groupName,
634                            int32_t             nameLength,
635                            UErrorCode          *status);
636 
637 
638 /**
639   * Get the group number corresponding to a named capture group.
640   * The returned number can be used with any function that accesses
641   * capture groups by number.
642   *
643   * The function returns an error status if the specified name does not
644   * appear in the pattern.
645   *
646   * @param  regexp      The compiled regular expression.
647   * @param  groupName   The capture group name,
648   *                     platform invariant characters only.
649   * @param  nameLength  The length of the name, or -1 if the name is
650   *                     nul-terminated.
651   * @param  status      A pointer to a UErrorCode to receive any errors.
652   * @return             The group number corresponding to the named capture group.
653   * @stable ICU 55
654   */
655 U_CAPI int32_t U_EXPORT2
656 uregex_groupNumberFromCName(URegularExpression *regexp,
657                             const char         *groupName,
658                             int32_t             nameLength,
659                             UErrorCode          *status);
660 
661 /** Extract the string for the specified matching expression or subexpression.
662   * Group #0 is the complete string of matched text.
663   * Group #1 is the text matched by the first set of capturing parentheses.
664   *
665   *   @param   regexp       The compiled regular expression.
666   *   @param   groupNum     The capture group to extract.  Group 0 is the complete
667   *                         match.  The value of this parameter must be
668   *                         less than or equal to the number of capture groups in
669   *                         the pattern.
670   *   @param   dest         Buffer to receive the matching string data
671   *   @param   destCapacity Capacity of the dest buffer.
672   *   @param   status       A reference to a UErrorCode to receive any errors.
673   *   @return               Length of matching data,
674   *                         or -1 if no applicable match.
675   *   @stable ICU 3.0
676   */
677 U_CAPI int32_t U_EXPORT2
678 uregex_group(URegularExpression *regexp,
679              int32_t             groupNum,
680              UChar              *dest,
681              int32_t             destCapacity,
682              UErrorCode          *status);
683 
684 /** Returns a shallow immutable clone of the entire input string with the current index set
685   *   to the beginning of the requested capture group.  The capture group length is also
686   *   returned via groupLength.
687   * Group #0 is the complete string of matched text.
688   * Group #1 is the text matched by the first set of capturing parentheses.
689   *
690   *   @param   regexp       The compiled regular expression.
691   *   @param   groupNum     The capture group to extract.  Group 0 is the complete
692   *                         match.  The value of this parameter must be
693   *                         less than or equal to the number of capture groups in
694   *                         the pattern.
695   *   @param   dest         A mutable UText in which to store the current input.
696   *                         If NULL, a new UText will be created as an immutable shallow clone
697   *                         of the entire input string.
698   *   @param   groupLength  Output parameter; receives the length of the requested capture group.
699   *   @param   status       A reference to a UErrorCode to receive any errors.
700   *   @return               The subject text currently associated with this regular expression.
701   *                         If a pre-allocated UText was provided, it will always be used and returned.
702   *
703   *
704   *   @stable ICU 4.6
705   */
706 U_CAPI UText * U_EXPORT2
707 uregex_groupUText(URegularExpression *regexp,
708                   int32_t             groupNum,
709                   UText              *dest,
710                   int64_t            *groupLength,
711                   UErrorCode         *status);
712 
713 /**
714   *   Returns the index in the input string of the start of the text matched by the
715   *   specified capture group during the previous match operation.  Returns -1 if
716   *   the capture group was not part of the last match.
717   *   Group #0 refers to the complete range of matched text.
718   *   Group #1 refers to the text matched by the first set of capturing parentheses.
719   *
720   *    @param   regexp      The compiled regular expression.
721   *    @param   groupNum    The capture group number.
722   *    @param   status      A reference to a UErrorCode to receive any errors.
723   *    @return              The starting (native) position in the input of the text matched
724   *                         by the specified group.
725   *    @stable ICU 3.0
726   */
727 U_CAPI int32_t U_EXPORT2
728 uregex_start(URegularExpression *regexp,
729              int32_t             groupNum,
730              UErrorCode          *status);
731 
732 /**
733   *   64-bit version of uregex_start().
734   *   Returns the index in the input string of the start of the text matched by the
735   *   specified capture group during the previous match operation.  Returns -1 if
736   *   the capture group was not part of the last match.
737   *   Group #0 refers to the complete range of matched text.
738   *   Group #1 refers to the text matched by the first set of capturing parentheses.
739   *
740   *    @param   regexp      The compiled regular expression.
741   *    @param   groupNum    The capture group number.
742   *    @param   status      A reference to a UErrorCode to receive any errors.
743   *    @return              The starting (native) position in the input of the text matched
744   *                         by the specified group.
745   *   @stable ICU 4.6
746   */
747 U_CAPI int64_t U_EXPORT2
748 uregex_start64(URegularExpression *regexp,
749                int32_t             groupNum,
750                UErrorCode          *status);
751 
752 /**
753   *   Returns the index in the input string of the position following the end
754   *   of the text matched by the specified capture group.
755   *   Returns -1 if the capture group was not part of the last match.
756   *   Group #0 refers to the complete range of matched text.
757   *   Group #1 refers to the text matched by the first set of capturing parentheses.
758   *
759   *    @param   regexp      The compiled regular expression.
760   *    @param   groupNum    The capture group number.
761   *    @param   status      A reference to a UErrorCode to receive any errors.
762   *    @return              The (native) index of the position following the last matched character.
763   *    @stable ICU 3.0
764   */
765 U_CAPI int32_t U_EXPORT2
766 uregex_end(URegularExpression   *regexp,
767            int32_t               groupNum,
768            UErrorCode           *status);
769 
770 /**
771   *   64-bit version of uregex_end().
772   *   Returns the index in the input string of the position following the end
773   *   of the text matched by the specified capture group.
774   *   Returns -1 if the capture group was not part of the last match.
775   *   Group #0 refers to the complete range of matched text.
776   *   Group #1 refers to the text matched by the first set of capturing parentheses.
777   *
778   *    @param   regexp      The compiled regular expression.
779   *    @param   groupNum    The capture group number.
780   *    @param   status      A reference to a UErrorCode to receive any errors.
781   *    @return              The (native) index of the position following the last matched character.
782   *   @stable ICU 4.6
783   */
784 U_CAPI int64_t U_EXPORT2
785 uregex_end64(URegularExpression *regexp,
786              int32_t               groupNum,
787              UErrorCode           *status);
788 
789 /**
790   *  Resets any saved state from the previous match.  Has the effect of
791   *  causing uregex_findNext() to begin at the specified index, and causing
792   *  uregex_start(), uregex_end() and uregex_group() to return an error
793   *  indicating that there is no match information available.  Clears any
794   *  match region that may have been set.
795   *
796   *    @param   regexp      The compiled regular expression.
797   *    @param   index       The position (native) in the text at which a
798   *                         uregex_findNext() should begin searching.
799   *    @param   status      A reference to a UErrorCode to receive any errors.
800   *    @stable ICU 3.0
801   */
802 U_CAPI void U_EXPORT2
803 uregex_reset(URegularExpression    *regexp,
804              int32_t               index,
805              UErrorCode            *status);
806 
807 /**
808   *  64-bit version of uregex_reset().
809   *  Resets any saved state from the previous match.  Has the effect of
810   *  causing uregex_findNext() to begin at the specified index, and causing
811   *  uregex_start(), uregex_end() and uregex_group() to return an error
812   *  indicating that there is no match information available.  Clears any
813   *  match region that may have been set.
814   *
815   *    @param   regexp      The compiled regular expression.
816   *    @param   index       The position (native) in the text at which a
817   *                         uregex_findNext() should begin searching.
818   *    @param   status      A reference to a UErrorCode to receive any errors.
819   *    @stable ICU 4.6
820   */
821 U_CAPI void U_EXPORT2
822 uregex_reset64(URegularExpression  *regexp,
823                int64_t               index,
824                UErrorCode            *status);
825 
826 /**
827   * Sets the limits of the matching region for this URegularExpression.
828   * The region is the part of the input string that will be considered when matching.
829   * Invoking this function resets any saved state from the previous match,
830   * then sets the region to start at the index specified by the regionStart parameter
831   * and end at the index specified by the regionLimit parameter.
832   *
833   * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
834   * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
835   * at or around the boundaries of the region.
836   *
837   * The function will fail if regionStart is greater than regionLimit, or if either index
838   *  is less than zero or greater than the length of the string being matched.
839   *
840   * @param regexp The compiled regular expression.
841   * @param regionStart  The (native) index to begin searches at.
842   * @param regionLimit  The (native) index to end searches at (exclusive).
843   * @param status A pointer to a UErrorCode to receive any errors.
844   * @stable ICU 4.0
845   */
846 U_CAPI void U_EXPORT2
847 uregex_setRegion(URegularExpression   *regexp,
848                  int32_t               regionStart,
849                  int32_t               regionLimit,
850                  UErrorCode           *status);
851 
852 /**
853   * 64-bit version of uregex_setRegion().
854   * Sets the limits of the matching region for this URegularExpression.
855   * The region is the part of the input string that will be considered when matching.
856   * Invoking this function resets any saved state from the previous match,
857   * then sets the region to start at the index specified by the regionStart parameter
858   * and end at the index specified by the regionLimit parameter.
859   *
860   * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds()
861   * and uregex_useAnchoringBounds()), certain constructs such as anchors may behave differently
862   * at or around the boundaries of the region.
863   *
864   * The function will fail if regionStart is greater than regionLimit, or if either index
865   *  is less than zero or greater than the length of the string being matched.
866   *
867   * @param regexp The compiled regular expression.
868   * @param regionStart  The (native) index to begin searches at.
869   * @param regionLimit  The (native) index to end searches at (exclusive).
870   * @param status A pointer to a UErrorCode to receive any errors.
871   * @stable ICU 4.6
872   */
873 U_CAPI void U_EXPORT2
874 uregex_setRegion64(URegularExpression *regexp,
875                  int64_t               regionStart,
876                  int64_t               regionLimit,
877                  UErrorCode           *status);
878 
879 /**
880   *  Sets the matching region and the starting index for subsequent matches
881   *  in a single operation.
882   *  This is useful because the usual function for setting the starting
883   *  index, uregex_reset(), also resets any region limits.
884   *
885   * @param regexp The compiled regular expression.
886   * @param regionStart  The (native) index to begin searches at.
887   * @param regionLimit  The (native) index to end searches at (exclusive).
888   * @param startIndex   The index in the input text at which the next
889   *                     match operation should begin.
890   * @param status A pointer to a UErrorCode to receive any errors.
891   * @stable ICU 4.6
892   */
893 U_CAPI void U_EXPORT2
894 uregex_setRegionAndStart(URegularExpression *regexp,
895                  int64_t               regionStart,
896                  int64_t               regionLimit,
897                  int64_t               startIndex,
898                  UErrorCode           *status);
899 
900 /**
901   * Reports the start index of the matching region. Any matches found are limited
902   * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
903   *
904   * @param regexp The compiled regular expression.
905   * @param status A pointer to a UErrorCode to receive any errors.
906   * @return The starting (native) index of this matcher's region.
907   * @stable ICU 4.0
908   */
909 U_CAPI int32_t U_EXPORT2
910 uregex_regionStart(const  URegularExpression   *regexp,
911                           UErrorCode           *status);
912 
913 /**
914   * 64-bit version of uregex_regionStart().
915   * Reports the start index of the matching region. Any matches found are limited
916   * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
917   *
918   * @param regexp The compiled regular expression.
919   * @param status A pointer to a UErrorCode to receive any errors.
920   * @return The starting (native) index of this matcher's region.
921   * @stable ICU 4.6
922   */
923 U_CAPI int64_t U_EXPORT2
924 uregex_regionStart64(const  URegularExpression   *regexp,
925                             UErrorCode           *status);
926 
927 /**
928   * Reports the end index (exclusive) of the matching region for this URegularExpression.
929   * Any matches found are limited to the region bounded by regionStart (inclusive)
930   * and regionEnd (exclusive).
931   *
932   * @param regexp The compiled regular expression.
933   * @param status A pointer to a UErrorCode to receive any errors.
934   * @return The ending point (native) of this matcher's region.
935   * @stable ICU 4.0
936   */
937 U_CAPI int32_t U_EXPORT2
938 uregex_regionEnd(const  URegularExpression   *regexp,
939                         UErrorCode           *status);
940 
941 /**
942   * 64-bit version of uregex_regionEnd().
943   * Reports the end index (exclusive) of the matching region for this URegularExpression.
944   * Any matches found are limited to the region bounded by regionStart (inclusive)
945   * and regionEnd (exclusive).
946   *
947   * @param regexp The compiled regular expression.
948   * @param status A pointer to a UErrorCode to receive any errors.
949   * @return The ending point (native) of this matcher's region.
950   * @stable ICU 4.6
951   */
952 U_CAPI int64_t U_EXPORT2
953 uregex_regionEnd64(const  URegularExpression   *regexp,
954                           UErrorCode           *status);
955 
956 /**
957   * Queries the transparency of region bounds for this URegularExpression.
958   * See uregex_useTransparentBounds() for a description of transparent and opaque bounds.
959   * By default, matching boundaries are opaque.
960   *
961   * @param regexp The compiled regular expression.
962   * @param status A pointer to a UErrorCode to receive any errors.
963   * @return true if this matcher is using transparent bounds, false if it is using opaque bounds.
964   * @stable ICU 4.0
965   */
966 U_CAPI UBool U_EXPORT2
967 uregex_hasTransparentBounds(const  URegularExpression   *regexp,
968                                    UErrorCode           *status);
969 
970 
971 /**
972   * Sets the transparency of region bounds for this URegularExpression.
973   * Invoking this function with an argument of true will set matches to use transparent bounds.
974   * If the boolean argument is false, then opaque bounds will be used.
975   *
976   * With transparent bounds, the boundaries of the matching region are transparent
977   * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
978   * see text beyond the boundaries of the region while checking for a match.
979   *
980   * With opaque bounds, no text outside of the matching region is visible to lookahead,
981   * lookbehind, and boundary matching constructs.
982   *
983   * By default, opaque bounds are used.
984   *
985   * @param   regexp The compiled regular expression.
986   * @param   b      true for transparent bounds; false for opaque bounds.
987   * @param   status A pointer to a UErrorCode to receive any errors.
988   * @stable ICU 4.0
989   **/
990 U_CAPI void U_EXPORT2
991 uregex_useTransparentBounds(URegularExpression   *regexp,
992                             UBool                b,
993                             UErrorCode           *status);
994 
995 
996 /**
997   * Returns true if this URegularExpression is using anchoring bounds.
998   * By default, anchoring region bounds are used.
999   *
1000   * @param  regexp The compiled regular expression.
1001   * @param  status A pointer to a UErrorCode to receive any errors.
1002   * @return true if this matcher is using anchoring bounds, false otherwise.
1003   * @stable ICU 4.0
1004   */
1005 U_CAPI UBool U_EXPORT2
1006 uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
1007                                  UErrorCode           *status);
1008 
1009 
1010 /**
1011   * Sets whether this URegularExpression is using anchoring bounds for its region.
1012   * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
1013   * and end of the region.  Without anchoring bounds, anchors will only match at
1014   * the positions they would in the complete text.
1015   *
1016   * Anchoring bounds are the default for regions.
1017   *
1018   * @param regexp The compiled regular expression.
1019   * @param b      true to enable anchoring bounds; false to disable them.
1020   * @param status A pointer to a UErrorCode to receive any errors.
1021   * @stable ICU 4.0
1022   */
1023 U_CAPI void U_EXPORT2
1024 uregex_useAnchoringBounds(URegularExpression   *regexp,
1025                           UBool                 b,
1026                           UErrorCode           *status);
1027 
1028 /**
1029   * Returns true if the most recent matching operation touched the
1030   *  end of the text being processed.  In this case, additional input text could
1031   *  change the results of that match.
1032   *
1033   *  @param regexp The compiled regular expression.
1034   *  @param status A pointer to a UErrorCode to receive any errors.
1035   *  @return  true if the most recent match hit the end of input.
1036   *  @stable ICU 4.0
1037   */
1038 U_CAPI UBool U_EXPORT2
1039 uregex_hitEnd(const  URegularExpression   *regexp,
1040                      UErrorCode           *status);
1041 
1042 /**
1043   * Returns true if the most recent match succeeded and additional input could cause
1044   * it to fail. If this function returns false and a match was found, then more input
1045   * might change the match but the match won't be lost. If a match was not found,
1046   * then the result of uregex_requireEnd() has no meaning.
1047   *
1048   * @param regexp The compiled regular expression.
1049   * @param status A pointer to a UErrorCode to receive any errors.
1050   * @return true  if more input could cause the most recent match to no longer match.
1051   * @stable ICU 4.0
1052   */
1053 U_CAPI UBool U_EXPORT2
1054 uregex_requireEnd(const  URegularExpression   *regexp,
1055                          UErrorCode           *status);
1056 
1057 
1058 
1059 
1060 
1061 /**
1062   *    Replaces every substring of the input that matches the pattern
1063   *    with the given replacement string.  This is a convenience function that
1064   *    provides a complete find-and-replace-all operation.
1065   *
1066   *    This function scans the input string looking for matches of the pattern.
1067   *    Input that is not part of any match is copied unchanged to the
1068   *    destination buffer.  Matched regions are replaced in the output
1069   *    buffer by the replacement string.   The replacement string may contain
1070   *    references to capture groups; these take the form of $1, $2, etc.
1071   *
1072   *    @param   regexp             The compiled regular expression.
1073   *    @param   replacementText    A string containing the replacement text.
1074   *    @param   replacementLength  The length of the replacement string, or
1075   *                                -1 if it is NUL terminated.
1076   *    @param   destBuf            A (UChar *) buffer that will receive the result.
1077   *    @param   destCapacity       The capacity of the destination buffer.
1078   *    @param   status             A reference to a UErrorCode to receive any errors.
1079   *    @return                     The length of the string resulting from the find
1080   *                                and replace operation.  In the event that the
1081   *                                destination capacity is inadequate, the return value
1082   *                                is still the full length of the untruncated string.
1083   *    @stable ICU 3.0
1084   */
1085 U_CAPI int32_t U_EXPORT2
1086 uregex_replaceAll(URegularExpression    *regexp,
1087                   const UChar           *replacementText,
1088                   int32_t                replacementLength,
1089                   UChar                 *destBuf,
1090                   int32_t                destCapacity,
1091                   UErrorCode            *status);
1092 
1093 /**
1094   *    Replaces every substring of the input that matches the pattern
1095   *    with the given replacement string.  This is a convenience function that
1096   *    provides a complete find-and-replace-all operation.
1097   *
1098   *    This function scans the input string looking for matches of the pattern.
1099   *    Input that is not part of any match is copied unchanged to the
1100   *    destination text.  Matched regions are replaced in the output
1101   *    text by the replacement string.   The replacement string may contain
1102   *    references to capture groups; these take the form of $1, $2, etc.
1103   *
1104   *    @param   regexp         The compiled regular expression.
1105   *    @param   replacement    A string containing the replacement text.
1106   *    @param   dest           A mutable UText that will receive the result.
1107   *                             If NULL, a new UText will be created (which may not be mutable).
1108   *    @param   status         A reference to a UErrorCode to receive any errors.
1109   *    @return                 A UText containing the results of the find and replace.
1110   *                             If a pre-allocated UText was provided, it will always be used and returned.
1111   *
1112   *    @stable ICU 4.6
1113   */
1114 U_CAPI UText * U_EXPORT2
1115 uregex_replaceAllUText(URegularExpression *regexp,
1116                        UText              *replacement,
1117                        UText              *dest,
1118                        UErrorCode         *status);
1119 
1120 /**
1121   *    Replaces the first substring of the input that matches the pattern
1122   *    with the given replacement string.  This is a convenience function that
1123   *    provides a complete find-and-replace operation.
1124   *
1125   *    This function scans the input string looking for a match of the pattern.
1126   *    All input that is not part of the match is copied unchanged to the
1127   *    destination buffer.  The matched region is replaced in the output
1128   *    buffer by the replacement string.   The replacement string may contain
1129   *    references to capture groups; these take the form of $1, $2, etc.
1130   *
1131   *    @param   regexp             The compiled regular expression.
1132   *    @param   replacementText    A string containing the replacement text.
1133   *    @param   replacementLength  The length of the replacement string, or
1134   *                                -1 if it is NUL terminated.
1135   *    @param   destBuf            A (UChar *) buffer that will receive the result.
1136   *    @param   destCapacity       The capacity of the destination buffer.
1137   *    @param   status             A reference to a UErrorCode to receive any errors.
1138   *    @return                     The length of the string resulting from the find
1139   *                                and replace operation.  In the event that the
1140   *                                destination capacity is inadequate, the return value
1141   *                                is still the full length of the untruncated string.
1142   *    @stable ICU 3.0
1143   */
1144 U_CAPI int32_t U_EXPORT2
1145 uregex_replaceFirst(URegularExpression  *regexp,
1146                     const UChar         *replacementText,
1147                     int32_t              replacementLength,
1148                     UChar               *destBuf,
1149                     int32_t              destCapacity,
1150                     UErrorCode          *status);
1151 
1152 /**
1153   *    Replaces the first substring of the input that matches the pattern
1154   *    with the given replacement string.  This is a convenience function that
1155   *    provides a complete find-and-replace operation.
1156   *
1157   *    This function scans the input string looking for a match of the pattern.
1158   *    All input that is not part of the match is copied unchanged to the
1159   *    destination text.  The matched region is replaced in the output
1160   *    text by the replacement string.   The replacement string may contain
1161   *    references to capture groups; these take the form of $1, $2, etc.
1162   *
1163   *    @param   regexp         The compiled regular expression.
1164   *    @param   replacement    A string containing the replacement text.
1165   *    @param   dest           A mutable UText that will receive the result.
1166   *                             If NULL, a new UText will be created (which may not be mutable).
1167   *    @param   status         A reference to a UErrorCode to receive any errors.
1168   *    @return                 A UText containing the results of the find and replace.
1169   *                             If a pre-allocated UText was provided, it will always be used and returned.
1170   *
1171   *    @stable ICU 4.6
1172   */
1173 U_CAPI UText * U_EXPORT2
1174 uregex_replaceFirstUText(URegularExpression *regexp,
1175                          UText              *replacement,
1176                          UText              *dest,
1177                          UErrorCode         *status);
1178 
1179 /**
1180   *   Implements a replace operation intended to be used as part of an
1181   *   incremental find-and-replace.
1182   *
1183   *   <p>The input string, starting from the end of the previous match and ending at
1184   *   the start of the current match, is appended to the destination string.  Then the
1185   *   replacement string is appended to the output string,
1186   *   including handling any substitutions of captured text.</p>
1187   *
1188   *   <p>A note on preflight computation of buffer size and error handling:
1189   *   Calls to uregex_appendReplacement() and uregex_appendTail() are
1190   *   designed to be chained, one after another, with the destination
1191   *   buffer pointer and buffer capacity updated after each in preparation
1192   *   for the next.  If the destination buffer is exhausted partway through such a
1193   *   sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned.  Normal
1194   *   ICU conventions are for a function to perform no action if it is
1195   *   called with an error status, but for this one case, uregex_appendReplacement()
1196   *   will operate normally so that buffer size computations will complete
1197   *   correctly.</p>
1198   *
1199   *   <p>For simple, prepackaged, non-incremental find-and-replace
1200   *      operations, see uregex_replaceFirst() or uregex_replaceAll().</p>
1201   *
1202   *   @param   regexp      The regular expression object.
1203   *   @param   replacementText The string that will replace the matched portion of the
1204   *                        input string as it is copied to the destination buffer.
1205   *                        The replacement text may contain references ($1, for
1206   *                        example) to capture groups from the match.
1207   *   @param   replacementLength  The length of the replacement text string,
1208   *                        or -1 if the string is NUL terminated.
1209   *   @param   destBuf     The buffer into which the results of the
1210   *                        find-and-replace are placed.  On return, this pointer
1211   *                        will be updated to refer to the beginning of the
1212   *                        unused portion of buffer, leaving it in position for
1213   *                        a subsequent call to this function.
1214   *   @param   destCapacity The size of the output buffer.  On return, this
1215   *                        parameter will be updated to reflect the space remaining
1216   *                        unused in the output buffer.
1217   *   @param   status      A reference to a UErrorCode to receive any errors.
1218   *   @return              The length of the result string.  In the event that
1219   *                        destCapacity is inadequate, the full length of the
1220   *                        untruncated output string is returned.
1221   *
1222   *   @stable ICU 3.0
1223   *
1224   */
1225 U_CAPI int32_t U_EXPORT2
1226 uregex_appendReplacement(URegularExpression    *regexp,
1227                          const UChar           *replacementText,
1228                          int32_t                replacementLength,
1229                          UChar                **destBuf,
1230                          int32_t               *destCapacity,
1231                          UErrorCode            *status);
1232 
1233 /**
1234   *   Implements a replace operation intended to be used as part of an
1235   *   incremental find-and-replace.
1236   *
1237   *   <p>The input string, starting from the end of the previous match and ending at
1238   *   the start of the current match, is appended to the destination string.  Then the
1239   *   replacement string is appended to the output string,
1240   *   including handling any substitutions of captured text.</p>
1241   *
1242   *   <p>For simple, prepackaged, non-incremental find-and-replace
1243   *      operations, see uregex_replaceFirstUText() or uregex_replaceAllUText().</p>
1244   *
1245   *   @param   regexp      The regular expression object.
1246   *   @param   replacementText The string that will replace the matched portion of the
1247   *                        input string as it is copied to the destination text.
1248   *                        The replacement text may contain references ($1, for
1249   *                        example) to capture groups from the match.
1250   *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
1251   *   @param   status      A reference to a UErrorCode to receive any errors.
1252   *
1253   *   @stable ICU 4.6
1254   */
1255 U_CAPI void U_EXPORT2
1256 uregex_appendReplacementUText(URegularExpression    *regexp,
1257                               UText                 *replacementText,
1258                               UText                 *dest,
1259                               UErrorCode            *status);
1260 
1261 /**
1262   * As the final step in a find-and-replace operation, appends the remainder
1263   * of the input string, starting at the position following the last match,
1264   * to the destination string. <code>uregex_appendTail()</code> is intended
1265   *  to be invoked after one or more invocations of the
1266   *  <code>uregex_appendReplacement()</code> function.
1267   *
1268   *   @param   regexp      The regular expression object.  This is needed to
1269   *                        obtain the input string along with the position
1270   *                        of the last match within it.
1271   *   @param   destBuf     The buffer in which the results of the
1272   *                        find-and-replace are placed.  On return, the pointer
1273   *                        will be updated to refer to the beginning of the
1274   *                        unused portion of buffer.
1275   *   @param   destCapacity The size of the output buffer.  On return, this
1276   *                        value will be updated to reflect the space remaining
1277   *                        unused in the output buffer.
1278   *   @param   status      A reference to a UErrorCode to receive any errors.
1279   *   @return              The length of the result string.  In the event that
1280   *                        destCapacity is inadequate, the full length of the
1281   *                        untruncated output string is returned.
1282   *
1283   *   @stable ICU 3.0
1284   */
1285 U_CAPI int32_t U_EXPORT2
1286 uregex_appendTail(URegularExpression    *regexp,
1287                   UChar                **destBuf,
1288                   int32_t               *destCapacity,
1289                   UErrorCode            *status);
1290 
1291 /**
1292   * As the final step in a find-and-replace operation, appends the remainder
1293   * of the input string, starting at the position following the last match,
1294   * to the destination string. <code>uregex_appendTailUText()</code> is intended
1295   *  to be invoked after one or more invocations of the
1296   *  <code>uregex_appendReplacementUText()</code> function.
1297   *
1298   *   @param   regexp      The regular expression object.  This is needed to
1299   *                        obtain the input string along with the position
1300   *                        of the last match within it.
1301   *   @param   dest        A mutable UText that will receive the result. Must not be NULL.
1302   *
1303   *   @param   status      A reference to a UErrorCode to receive any errors.
1304   *
1305   *   @return              The destination UText.
1306   *
1307   *   @stable ICU 4.6
1308   */
1309 U_CAPI UText * U_EXPORT2
1310 uregex_appendTailUText(URegularExpression    *regexp,
1311                        UText                 *dest,
1312                        UErrorCode            *status);
1313 
1314  /**
1315    * Split a string into fields.  Somewhat like split() from Perl.
1316    *  The pattern matches identify delimiters that separate the input
1317    *  into fields.  The input data between the matches becomes the
1318    *  fields themselves.
1319    *
1320    *  Each of the fields is copied from the input string to the destination
1321    *  buffer, and NUL terminated.  The position of each field within
1322    *  the destination buffer is returned in the destFields array.
1323    *
1324    *  If the delimiter pattern includes capture groups, the captured text will
1325    *  also appear in the destination array of output strings, interspersed
1326    *  with the fields.  This is similar to Perl, but differs from Java,
1327    *  which ignores the presence of capture groups in the pattern.
1328    *
1329    *  Trailing empty fields will always be returned, assuming sufficient
1330    *  destination capacity.  This differs from the default behavior for Java
1331    *  and Perl where trailing empty fields are not returned.
1332    *
1333    *  The number of strings produced by the split operation is returned.
1334    *  This count includes the strings from capture groups in the delimiter pattern.
1335    *  This behavior differs from Java, which ignores capture groups.
1336    *
1337    *    @param   regexp      The compiled regular expression.
1338    *    @param   destBuf     A (UChar *) buffer to receive the fields that
1339    *                         are extracted from the input string.  The
1340    *                         pointers stored in destFields will refer to positions
1341    *                         within this caller-supplied buffer.  Any
1342    *                         extra positions within the destFields array will be
1343    *                         set to NULL.
1344    *    @param   destCapacity The capacity of the destBuf.
1345    *    @param   requiredCapacity  The actual capacity required of the destBuf.
1346    *                         If destCapacity is too small, requiredCapacity will return
1347    *                         the total capacity required to hold all of the output, and
1348    *                         a U_BUFFER_OVERFLOW_ERROR will be returned.
1349    *    @param   destFields  An array to be filled with the position of each
1350    *                         of the extracted fields within destBuf.
1351    *    @param   destFieldsCapacity  The number of elements in the destFields array.
1352    *                If the number of fields found is less than destFieldsCapacity,
1353    *                the extra destFields elements are set to zero.
1354    *                If destFieldsCapacity is too small, the trailing part of the
1355    *                input, including any field delimiters, is treated as if it
1356    *                were the last field - it is copied to the destBuf, and
1357    *                its position in the destBuf is stored in the last element
1358    *                of destFields.  This behavior mimics that of Perl.  It is not
1359    *                an error condition, and no error status is returned when all
1360    *                destFields positions are used.
1361    * @param status  A reference to a UErrorCode to receive any errors.
1362    * @return        The number of fields into which the input string was split.
1363    * @stable ICU 3.0
1364    */
1365 U_CAPI int32_t U_EXPORT2
1366 uregex_split(   URegularExpression      *regexp,
1367                   UChar                 *destBuf,
1368                   int32_t                destCapacity,
1369                   int32_t               *requiredCapacity,
1370                   UChar                 *destFields[],
1371                   int32_t                destFieldsCapacity,
1372                   UErrorCode            *status);
1373 
1374   /**
1375    * Split a string into fields.  Somewhat like split() from Perl.
1376    * The pattern matches identify delimiters that separate the input
1377    *  into fields.  The input data between the matches becomes the
1378    *  fields themselves.
1379    * <p>
1380    * The behavior of this function is not very closely aligned with uregex_split();
1381    * instead, it is based on (and implemented directly on top of) the C++ split method.
1382    *
1383    * @param regexp  The compiled regular expression.
1384    * @param destFields    An array of mutable UText structs to receive the results of the split.
1385    *                If a field is NULL, a new UText is allocated to contain the results for
1386    *                that field. This new UText is not guaranteed to be mutable.
1387    * @param destFieldsCapacity  The number of elements in the destination array.
1388    *                If the number of fields found is less than destFieldsCapacity,
1389    *                the extra strings in the destination array are not altered.
1390    *                If the number of destination strings is less than the number
1391    *                of fields, the trailing part of the input string, including any
1392    *                field delimiters, is placed in the last destination string.
1393    *                This behavior mimics that of Perl.  It is not an error condition, and no
1394    *                error status is returned when all destFields positions are used.
1395    * @param status  A reference to a UErrorCode to receive any errors.
1396    * @return        The number of fields into which the input string was split.
1397    *
1398    * @stable ICU 4.6
1399    */
1400 U_CAPI int32_t U_EXPORT2
1401 uregex_splitUText(URegularExpression    *regexp,
1402                   UText                 *destFields[],
1403                   int32_t                destFieldsCapacity,
1404                   UErrorCode            *status);
1405 
1406 /**
1407  * Set a processing time limit for match operations with this URegularExpression.
1408  *
1409  * Some patterns, when matching certain strings, can run in exponential time.
1410  * For practical purposes, the match operation may appear to be in an
1411  * infinite loop.
1412  * When a limit is set, a match operation will fail with an error if the
1413  * limit is exceeded.
1414  * <p>
1415  * The units of the limit are steps of the match engine.
1416  * Correspondence with actual processor time will depend on the speed
1417  * of the processor and the details of the specific pattern, but will
1418  * typically be on the order of milliseconds.
1419  * <p>
1420  * By default, the matching time is not limited.
1421  *
1422  *
1423  * @param   regexp      The compiled regular expression.
1424  * @param   limit       The limit value, or 0 for no limit.
1425  * @param   status      A reference to a UErrorCode to receive any errors.
1426  * @stable ICU 4.0
1427  */
1428 U_CAPI void U_EXPORT2
1429 uregex_setTimeLimit(URegularExpression      *regexp,
1430                     int32_t                  limit,
1431                     UErrorCode              *status);
1432 
1433 /**
1434  * Get the time limit for matches with this URegularExpression.
1435  * A return value of zero indicates that there is no limit.
1436  *
1437  * @param   regexp      The compiled regular expression.
1438  * @param   status      A reference to a UErrorCode to receive any errors.
1439  * @return the maximum allowed time for a match, in units of processing steps.
1440  * @stable ICU 4.0
1441  */
1442 U_CAPI int32_t U_EXPORT2
1443 uregex_getTimeLimit(const URegularExpression      *regexp,
1444                           UErrorCode              *status);
1445 
1446 /**
1447  * Set the amount of heap storage available for use by the match backtracking stack.
1448  * <p>
1449  * ICU uses a backtracking regular expression engine, with the backtrack stack
1450  * maintained on the heap.  This function sets the limit to the amount of memory
1451  * that can be used for this purpose.  A backtracking stack overflow will
1452  * result in an error from the match operation that caused it.
1453  * <p>
1454  * A limit is desirable because a malicious or poorly designed pattern can use
1455  * excessive memory, potentially crashing the process.  A limit is enabled
1456  * by default.
1457  * <p>
1458  * @param   regexp      The compiled regular expression.
1459  * @param   limit       The maximum size, in bytes, of the matching backtrack stack.
1460  *                      A value of zero means no limit.
1461  *                      The limit must be greater than or equal to zero.
1462  * @param   status      A reference to a UErrorCode to receive any errors.
1463  *
1464  * @stable ICU 4.0
1465  */
1466 U_CAPI void U_EXPORT2
1467 uregex_setStackLimit(URegularExpression      *regexp,
1468                      int32_t                  limit,
1469                      UErrorCode              *status);
1470 
1471 /**
1472  * Get the size of the heap storage available for use by the backtracking stack.
1473  * @param   regexp  The compiled regular expression.
1474  * @param   status  A reference to a UErrorCode to receive any errors.
1475  * @return  the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited.
1476  * @stable ICU 4.0
1477  */
1478 U_CAPI int32_t U_EXPORT2
1479 uregex_getStackLimit(const URegularExpression      *regexp,
1480                            UErrorCode              *status);
1481 
1482 
1483 /**
1484  * Function type for a regular expression matching callback function.
1485  * When set, a callback function will be called periodically during matching
1486  * operations.  If the callback function returns false, the matching
1487  * operation will be terminated early.
1488  *
1489  * Note:  the callback function must not call other functions on this
1490  *        URegularExpression.
1491  *
1492  * @param context  context pointer.  The callback function will be invoked
1493  *                 with the context specified at the time that
1494  *                 uregex_setMatchCallback() is called.
1495  * @param steps    the accumulated processing time, in match steps,
1496  *                 for this matching operation.
1497  * @return         true to continue the matching operation.
1498  *                 false to terminate the matching operation.
1499  * @stable ICU 4.0
1500  */
1501 U_CDECL_BEGIN
1502 typedef UBool U_CALLCONV URegexMatchCallback (
1503                    const void *context,
1504                    int32_t     steps);
1505 U_CDECL_END
1506 
1507 /**
1508  * Set a callback function for this URegularExpression.
1509  * During matching operations the function will be called periodically,
1510  * giving the application the opportunity to terminate a long-running
1511  * match.
1512  *
1513  * @param   regexp      The compiled regular expression.
1514  * @param   callback    A pointer to the user-supplied URegexMatchCallback function.
1515  * @param   context     User context pointer.  The value supplied at the
1516  *                      time the callback function is set will be saved
1517  *                      and passed to the callback each time that it is called.
1518  * @param   status      A reference to a UErrorCode to receive any errors.
1519  * @stable ICU 4.0
1520  */
1521 U_CAPI void U_EXPORT2
1522 uregex_setMatchCallback(URegularExpression      *regexp,
1523                         URegexMatchCallback     *callback,
1524                         const void              *context,
1525                         UErrorCode              *status);
1526 
1527 
1528 /**
1529  * Get the callback function for this URegularExpression.
1530  *
1531  * @param   regexp      The compiled regular expression.
1532  * @param   callback    Out parameter; receives a pointer to the user-supplied
1533  *                      callback function.
1534  * @param   context     Out parameter; receives the user context pointer that
1535  *                      was set when uregex_setMatchCallback() was called.
1536  * @param   status      A reference to a UErrorCode to receive any errors.
1537  * @stable ICU 4.0
1538  */
1539 U_CAPI void U_EXPORT2
1540 uregex_getMatchCallback(const URegularExpression    *regexp,
1541                         URegexMatchCallback        **callback,
1542                         const void                 **context,
1543                         UErrorCode                  *status);
1544 
1545 /**
1546  * Function type for a regular expression find callback function.
1547  *
1548  * When set, a callback function will be called during a find operation
1549  * and for operations that depend on find, such as findNext, split and some replace
1550  * operations like replaceFirst.
1551  * The callback will usually be called after each attempt at a match, but this is not a
1552  * guarantee that the callback will be invoked at each character.  For finds where the
1553  * match engine is invoked at each character, this may be close to true, but less likely
1554  * for more optimized loops where the pattern is known to only start, and the match
1555  * engine invoked, at certain characters.
1556  * When invoked, this callback will specify the index at which a match operation is about
1557  * to be attempted, giving the application the opportunity to terminate a long-running
1558  * find operation.
1559  *
1560  * If the callback function returns false, the find operation will be terminated early.
1561  *
1562  * Note:  the callback function must not call other functions on this
1563  *        URegularExpression.
1564  *
1565  * @param context  context pointer.  The callback function will be invoked
1566  *                 with the context specified at the time that
1567  *                 uregex_setFindProgressCallback() is called.
1568  * @param matchIndex  the next index at which a match will be attempted for this
1569  *                 find operation.  If this callback interrupts the search, this is the
1570  *                 index at which a find/findNext operation may be re-initiated.
1571  * @return         true to continue the matching operation.
1572  *                 false to terminate the matching operation.
1573  * @stable ICU 4.6
1574  */
1575 U_CDECL_BEGIN
1576 typedef UBool U_CALLCONV URegexFindProgressCallback (
1577                    const void *context,
1578                    int64_t     matchIndex);
1579 U_CDECL_END
1580 
1581 
1582 /**
1583  * Set the find progress callback function for this URegularExpression.
1584  *
1585  * @param   regexp      The compiled regular expression.
1586  * @param   callback    A pointer to the user-supplied URegexFindProgressCallback function.
1587  * @param   context     User context pointer.  The value supplied at the
1588  *                      time the callback function is set will be saved
1589  *                      and passed to the callback each time that it is called.
1590  * @param   status      A reference to a UErrorCode to receive any errors.
1591  * @stable ICU 4.6
1592  */
1593 U_CAPI void U_EXPORT2
1594 uregex_setFindProgressCallback(URegularExpression              *regexp,
1595                                 URegexFindProgressCallback      *callback,
1596                                 const void                      *context,
1597                                 UErrorCode                      *status);
1598 
1599 /**
1600  * Get the find progress callback function for this URegularExpression.
1601  *
1602  * @param   regexp      The compiled regular expression.
1603  * @param   callback    Out parameter; receives a pointer to the user-supplied
1604  *                      callback function.
1605  * @param   context     Out parameter; receives the user context pointer that
1606  *                      was set when uregex_setFindProgressCallback() was called.
1607  * @param   status      A reference to a UErrorCode to receive any errors.
1608  * @stable ICU 4.6
1609  */
1610 U_CAPI void U_EXPORT2
1611 uregex_getFindProgressCallback(const URegularExpression          *regexp,
1612                                 URegexFindProgressCallback        **callback,
1613                                 const void                        **context,
1614                                 UErrorCode                        *status);
1615 
1616 #endif   /*  !UCONFIG_NO_REGULAR_EXPRESSIONS  */
1617 #endif   /*  UREGEX_H  */
1618