• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2002-2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  regex.h
7 *   encoding:   US-ASCII
8 *   indentation:4
9 *
10 *   created on: 2002oct22
11 *   created by: Andy Heninger
12 *
13 *   ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 #define REGEX_DEBUG
20 
21 /**
22  * \file
23  * \brief  C++ API:  Regular Expressions
24  *
25  * <h2>Regular Expression API</h2>
26  *
27  * <p>The ICU API for processing regular expressions consists of two classes,
28  *  <code>RegexPattern</code> and <code>RegexMatcher</code>.
29  *  <code>RegexPattern</code> objects represent a pre-processed, or compiled
30  *  regular expression.  They are created from a regular expression pattern string,
31  *  and can be used to create <code>RegexMatcher</code> objects for the pattern.</p>
32  *
33  * <p>Class <code>RegexMatcher</code> bundles together a regular expression
34  *  pattern and a target string to which the search pattern will be applied.
35  *  <code>RegexMatcher</code> includes API for doing plain find or search
36  *  operations, for search and replace operations, and for obtaining detailed
37  *  information about bounds of a match. </p>
38  *
39  * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
40  * expression pattern strings application code can be simplified and the explicit
41  * need for <code>RegexPattern</code> objects can usually be eliminated.
42  * </p>
43  */
44 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/parseerr.h"
52 
53 #include "unicode/uregex.h"
54 
55 U_NAMESPACE_BEGIN
56 
57 
58 // Forward Declarations...
59 
60 class RegexMatcher;
61 class RegexPattern;
62 class UVector;
63 class UVector32;
64 class UnicodeSet;
65 struct REStackFrame;
66 struct Regex8BitSet;
67 class  RuleBasedBreakIterator;
68 class  RegexCImpl;
69 
70 
71 
72 
73 /**
74  *   RegexPatternDump   Debug function, displays the compiled form of a pattern.
75  *   @internal
76  */
77 #ifdef REGEX_DEBUG
78 U_INTERNAL void U_EXPORT2
79     RegexPatternDump(const RegexPattern *pat);
80 #else
81     #define RegexPatternDump(pat)
82 #endif
83 
84 
85 
86 /**
87   * Class <code>RegexPattern</code> represents a compiled regular expression.  It includes
88   * factory methods for creating a RegexPattern object from the source (string) form
89   * of a regular expression, methods for creating RegexMatchers that allow the pattern
90   * to be applied to input text, and a few convenience methods for simple common
91   * uses of regular expressions.
92   *
93   * <p>Class RegexPattern is not intended to be subclassed.</p>
94   *
95   * @stable ICU 2.4
96   */
97 class U_I18N_API RegexPattern: public UObject {
98 public:
99 
100     /**
101      * default constructor.  Create a RegexPattern object that refers to no actual
102      *   pattern.  Not normally needed; RegexPattern objects are usually
103      *   created using the factory method <code>compile()</code>.
104      *
105      * @stable ICU 2.4
106      */
107     RegexPattern();
108 
109     /**
110      * Copy Constructor.  Create a new RegexPattern object that is equivalent
111      *                    to the source object.
112      * @param source the pattern object to be copied.
113      * @stable ICU 2.4
114      */
115     RegexPattern(const RegexPattern &source);
116 
117     /**
118      * Destructor.  Note that a RegexPattern object must persist so long as any
119      *  RegexMatcher objects that were created from the RegexPattern are active.
120      * @stable ICU 2.4
121      */
122     virtual ~RegexPattern();
123 
124     /**
125      * Comparison operator.  Two RegexPattern objects are considered equal if they
126      * were constructed from identical source patterns using the same match flag
127      * settings.
128      * @param that a RegexPattern object to compare with "this".
129      * @return TRUE if the objects are equivalent.
130      * @stable ICU 2.4
131      */
132     UBool           operator==(const RegexPattern& that) const;
133 
134     /**
135      * Comparison operator.  Two RegexPattern objects are considered equal if they
136      * were constructed from identical source patterns using the same match flag
137      * settings.
138      * @param that a RegexPattern object to compare with "this".
139      * @return TRUE if the objects are different.
140      * @stable ICU 2.4
141      */
142     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};
143 
144     /**
145      * Assignment operator.  After assignment, this RegexPattern will behave identically
146      *     to the source object.
147      * @stable ICU 2.4
148      */
149     RegexPattern  &operator =(const RegexPattern &source);
150 
151     /**
152      * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
153      * intended to be subclasses, <code>clone()</code> and the copy construction are
154      * equivalent operations.
155      * @return the copy of this RegexPattern
156      * @stable ICU 2.4
157      */
158     virtual RegexPattern  *clone() const;
159 
160 
161    /**
162     * Compiles the regular expression in string form into a RegexPattern
163     * object.  These compile methods, rather than the constructors, are the usual
164     * way that RegexPattern objects are created.
165     *
166     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
167     * objects created from the pattern are active.  RegexMatchers keep a pointer
168     * back to their pattern, so premature deletion of the pattern is a
169     * catastrophic error.</p>
170     *
171     * <p>All pattern match mode flags are set to their default values.</p>
172     *
173     * <p>Note that it is often more convenient to construct a RegexMatcher directly
174     *    from a pattern string rather than separately compiling the pattern and
175     *    then creating a RegexMatcher object from the pattern.</p>
176     *
177     * @param regex The regular expression to be compiled.
178     * @param pe    Receives the position (line and column nubers) of any error
179     *              within the regular expression.)
180     * @param status A reference to a UErrorCode to receive any errors.
181     * @return      A regexPattern object for the compiled pattern.
182     *
183     * @stable ICU 2.4
184     */
185     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
186         UParseError          &pe,
187         UErrorCode           &status);
188 
189    /**
190     * Compiles the regular expression in string form into a RegexPattern
191     * object using the specified match mode flags.  These compile methods,
192     * rather than the constructors, are the usual way that RegexPattern objects
193     * are created.
194     *
195     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
196     * objects created from the pattern are active.  RegexMatchers keep a pointer
197     * back to their pattern, so premature deletion of the pattern is a
198     * catastrophic error.</p>
199     *
200     * <p>Note that it is often more convenient to construct a RegexMatcher directly
201     *    from a pattern string instead of than separately compiling the pattern and
202     *    then creating a RegexMatcher object from the pattern.</p>
203     *
204     * @param regex The regular expression to be compiled.
205     * @param flags The match mode flags to be used.
206     * @param pe    Receives the position (line and column nubers) of any error
207     *              within the regular expression.)
208     * @param status   A reference to a UErrorCode to receive any errors.
209     * @return      A regexPattern object for the compiled pattern.
210     *
211     * @stable ICU 2.4
212     */
213     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
214         uint32_t             flags,
215         UParseError          &pe,
216         UErrorCode           &status);
217 
218 
219    /**
220     * Compiles the regular expression in string form into a RegexPattern
221     * object using the specified match mode flags.  These compile methods,
222     * rather than the constructors, are the usual way that RegexPattern objects
223     * are created.
224     *
225     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
226     * objects created from the pattern are active.  RegexMatchers keep a pointer
227     * back to their pattern, so premature deletion of the pattern is a
228     * catastrophic error.</p>
229     *
230     * <p>Note that it is often more convenient to construct a RegexMatcher directly
231     *    from a pattern string instead of than separately compiling the pattern and
232     *    then creating a RegexMatcher object from the pattern.</p>
233     *
234     * @param regex The regular expression to be compiled.
235     * @param flags The match mode flags to be used.
236     * @param status   A reference to a UErrorCode to receive any errors.
237     * @return      A regexPattern object for the compiled pattern.
238     *
239     * @stable ICU 2.6
240     */
241     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
242         uint32_t             flags,
243         UErrorCode           &status);
244 
245 
246    /**
247     * Get the match mode flags that were used when compiling this pattern.
248     * @return  the match mode flags
249     * @stable ICU 2.4
250     */
251     virtual uint32_t flags() const;
252 
253    /**
254     * Creates a RegexMatcher that will match the given input against this pattern.  The
255     * RegexMatcher can then be used to perform match, find or replace operations
256     * on the input.  Note that a RegexPattern object must not be deleted while
257     * RegexMatchers created from it still exist and might possibly be used again.
258     * <p>
259     * The matcher will retain a reference to the supplied input string, and all regexp
260     * pattern matching operations happen directly on this original string.  It is
261     * critical that the string not be altered or deleted before use by the regular
262     * expression operations is complete.
263     *
264     * @param input    The input string to which the regular expression will be applied.
265     * @param status   A reference to a UErrorCode to receive any errors.
266     * @return         A RegexMatcher object for this pattern and input.
267     *
268     * @stable ICU 2.4
269     */
270     virtual RegexMatcher *matcher(const UnicodeString &input,
271         UErrorCode          &status) const;
272 
273 private:
274     /**
275      * Cause a compilation error if an application accidently attempts to
276      *   create a matcher with a (UChar *) string as input rather than
277      *   a UnicodeString.  Avoids a dangling reference to a temporary string.
278      * <p>
279      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
280      * using one of the aliasing constructors, such as
281      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
282      *
283      * @internal
284      */
285     RegexMatcher *matcher(const UChar *input,
286         UErrorCode          &status) const;
287 public:
288 
289 
290    /**
291     * Creates a RegexMatcher that will match against this pattern.  The
292     * RegexMatcher can be used to perform match, find or replace operations.
293     * Note that a RegexPattern object must not be deleted while
294     * RegexMatchers created from it still exist and might possibly be used again.
295     *
296     * @param status   A reference to a UErrorCode to receive any errors.
297     * @return      A RegexMatcher object for this pattern and input.
298     *
299     * @stable ICU 2.6
300     */
301     virtual RegexMatcher *matcher(UErrorCode  &status) const;
302 
303 
304    /**
305     * Test whether a string matches a regular expression.  This convenience function
306     * both compiles the reguluar expression and applies it in a single operation.
307     * Note that if the same pattern needs to be applied repeatedly, this method will be
308     * less efficient than creating and reusing a RegexMatcher object.
309     *
310     * @param regex The regular expression
311     * @param input The string data to be matched
312     * @param pe Receives the position of any syntax errors within the regular expression
313     * @param status A reference to a UErrorCode to receive any errors.
314     * @return True if the regular expression exactly matches the full input string.
315     *
316     * @stable ICU 2.4
317     */
318     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
319         const UnicodeString   &input,
320         UParseError     &pe,
321         UErrorCode      &status);
322 
323 
324    /**
325     *    Returns the regular expression from which this pattern was compiled.
326     *    @stable ICU 2.4
327     */
328     virtual UnicodeString pattern() const;
329 
330 
331     /**
332      * Split a string into fields.  Somewhat like split() from Perl.
333      * The pattern matches identify delimiters that separate the input
334      *  into fields.  The input data between the matches becomes the
335      *  fields themselves.
336      * <p>
337      *  For the best performance on split() operations,
338      *  <code>RegexMatcher::split</code> is perferable to this function
339      *
340      * @param input   The string to be split into fields.  The field delimiters
341      *                match the pattern (in the "this" object)
342      * @param dest    An array of UnicodeStrings to receive the results of the split.
343      *                This is an array of actual UnicodeString objects, not an
344      *                array of pointers to strings.  Local (stack based) arrays can
345      *                work well here.
346      * @param destCapacity  The number of elements in the destination array.
347      *                If the number of fields found is less than destCapacity, the
348      *                extra strings in the destination array are not altered.
349      *                If the number of destination strings is less than the number
350      *                of fields, the trailing part of the input string, including any
351      *                field delimiters, is placed in the last destination string.
352      * @param status  A reference to a UErrorCode to receive any errors.
353      * @return        The number of fields into which the input string was split.
354      * @stable ICU 2.4
355      */
356     virtual int32_t  split(const UnicodeString &input,
357         UnicodeString    dest[],
358         int32_t          destCapacity,
359         UErrorCode       &status) const;
360 
361 
362     /**
363      * ICU "poor man's RTTI", returns a UClassID for the actual class.
364      *
365      * @stable ICU 2.4
366      */
367     virtual UClassID getDynamicClassID() const;
368 
369     /**
370      * ICU "poor man's RTTI", returns a UClassID for this class.
371      *
372      * @stable ICU 2.4
373      */
374     static UClassID U_EXPORT2 getStaticClassID();
375 
376 private:
377     //
378     //  Implementation Data
379     //
380     UnicodeString   fPattern;      // The original pattern string.
381     uint32_t        fFlags;        // The flags used when compiling the pattern.
382                                    //
383     UVector32       *fCompiledPat; // The compiled pattern p-code.
384     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
385                                    //   after un-escaping, for use during the match.
386 
387     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
388     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
389 
390 
391     UErrorCode      fDeferredStatus; // status if some prior error has left this
392                                    //  RegexPattern in an unusable state.
393 
394     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
395                                    //   >= this value.  For some patterns, this calculated
396                                    //   value may be less than the true shortest
397                                    //   possible match.
398 
399     int32_t         fFrameSize;    // Size of a state stack frame in the
400                                    //   execution engine.
401 
402     int32_t         fDataSize;     // The size of the data needed by the pattern that
403                                    //   does not go on the state stack, but has just
404                                    //   a single copy per matcher.
405 
406     UVector32       *fGroupMap;    // Map from capture group number to position of
407                                    //   the group's variables in the matcher stack frame.
408 
409     int32_t         fMaxCaptureDigits;
410 
411     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
412                                    //   regex character classes, e.g. Word.
413 
414     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
415                                    //  sets for predefined regex classes.
416 
417     int32_t         fStartType;    // Info on how a match must start.
418     int32_t         fInitialStringIdx;     //
419     int32_t         fInitialStringLen;
420     UnicodeSet     *fInitialChars;
421     UChar32         fInitialChar;
422     Regex8BitSet   *fInitialChars8;
423 
424     friend class RegexCompile;
425     friend class RegexMatcher;
426     friend class RegexCImpl;
427 
428     //
429     //  Implementation Methods
430     //
431     void        init();            // Common initialization, for use by constructors.
432     void        zap();             // Common cleanup
433 #ifdef REGEX_DEBUG
434     void        dumpOp(int32_t index) const;
435     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
436 #endif
437 
438 };
439 
440 
441 
442 /**
443  *  class RegexMatcher bundles together a regular expression pattern and
444  *  input text to which the expression can be applied.  It includes methods
445  *  for testing for matches, and for find and replace operations.
446  *
447  * <p>Class RegexMatcher is not intended to be subclassed.</p>
448  *
449  * @stable ICU 2.4
450  */
451 class U_I18N_API RegexMatcher: public UObject {
452 public:
453 
454     /**
455       * Construct a RegexMatcher for a regular expression.
456       * This is a convenience method that avoids the need to explicitly create
457       * a RegexPattern object.  Note that if several RegexMatchers need to be
458       * created for the same expression, it will be more efficient to
459       * separately create and cache a RegexPattern object, and use
460       * its matcher() method to create the RegexMatcher objects.
461       *
462       *  @param regexp The Regular Expression to be compiled.
463       *  @param flags  Regular expression options, such as case insensitive matching.
464       *                @see UREGEX_CASE_INSENSITIVE
465       *  @param status Any errors are reported by setting this UErrorCode variable.
466       *  @stable ICU 2.6
467       */
468     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
469 
470     /**
471       * Construct a RegexMatcher for a regular expression.
472       * This is a convenience method that avoids the need to explicitly create
473       * a RegexPattern object.  Note that if several RegexMatchers need to be
474       * created for the same expression, it will be more efficient to
475       * separately create and cache a RegexPattern object, and use
476       * its matcher() method to create the RegexMatcher objects.
477       * <p>
478       * The matcher will retain a reference to the supplied input string, and all regexp
479       * pattern matching operations happen directly on the original string.  It is
480       * critical that the string not be altered or deleted before use by the regular
481       * expression operations is complete.
482       *
483       *  @param regexp The Regular Expression to be compiled.
484       *  @param input  The string to match.  The matcher retains a reference to the
485       *                caller's string; no copy is made.
486       *  @param flags  Regular expression options, such as case insensitive matching.
487       *                @see UREGEX_CASE_INSENSITIVE
488       *  @param status Any errors are reported by setting this UErrorCode variable.
489       *  @stable ICU 2.6
490       */
491     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
492         uint32_t flags, UErrorCode &status);
493 
494 private:
495     /**
496      * Cause a compilation error if an application accidentally attempts to
497      *   create a matcher with a (UChar *) string as input rather than
498      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
499      * <p>
500      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
501      * using one of the aliasing constructors, such as
502      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
503      *
504      * @internal
505      */
506     RegexMatcher(const UnicodeString &regexp, const UChar *input,
507         uint32_t flags, UErrorCode &status);
508 public:
509 
510 
511    /**
512     *   Destructor.
513     *
514     *  @stable ICU 2.4
515     */
516     virtual ~RegexMatcher();
517 
518 
519    /**
520     *   Attempts to match the entire input region against the pattern.
521     *    @param   status     A reference to a UErrorCode to receive any errors.
522     *    @return TRUE if there is a match
523     *    @stable ICU 2.4
524     */
525     virtual UBool matches(UErrorCode &status);
526 
527    /**
528     *   Resets the matcher, then attempts to match the input beginning
529     *   at the specified startIndex, and extending to the end of the input.
530     *   The input region is reset to include the entire input string.
531     *   A successful match must extend to the end of the input.
532     *    @param   startIndex The input string index at which to begin matching.
533     *    @param   status     A reference to a UErrorCode to receive any errors.
534     *    @return TRUE if there is a match
535     *    @stable ICU 2.8
536     */
537     virtual UBool matches(int32_t startIndex, UErrorCode &status);
538 
539 
540 
541 
542    /**
543     *   Attempts to match the input string, starting from the beginning of the region,
544     *   against the pattern.  Like the matches() method, this function
545     *   always starts at the beginning of the input region;
546     *   unlike that function, it does not require that the entire region be matched.
547     *
548     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
549     *     <code>end()</code>, and <code>group()</code> functions.</p>
550     *
551     *    @param   status     A reference to a UErrorCode to receive any errors.
552     *    @return  TRUE if there is a match at the start of the input string.
553     *    @stable ICU 2.4
554     */
555     virtual UBool lookingAt(UErrorCode &status);
556 
557 
558   /**
559     *   Attempts to match the input string, starting from the specified index, against the pattern.
560     *   The match may be of any length, and is not required to extend to the end
561     *   of the input string.  Contrast with matches().
562     *
563     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
564     *     <code>end()</code>, and <code>group()</code> functions.</p>
565     *
566     *    @param   startIndex The input string index at which to begin matching.
567     *    @param   status     A reference to a UErrorCode to receive any errors.
568     *    @return  TRUE if there is a match.
569     *    @stable ICU 2.8
570     */
571     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
572 
573    /**
574     *  Find the next pattern match in the input string.
575     *  The find begins searching the input at the location following the end of
576     *  the previous match, or at the start of the string if there is no previous match.
577     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
578     *  will provide more information regarding the match.
579     *  <p>Note that if the input string is changed by the application,
580     *     use find(startPos, status) instead of find(), because the saved starting
581     *     position may not be valid with the altered input string.</p>
582     *  @return  TRUE if a match is found.
583     *  @stable ICU 2.4
584     */
585     virtual UBool find();
586 
587 
588    /**
589     *   Resets this RegexMatcher and then attempts to find the next substring of the
590     *   input string that matches the pattern, starting at the specified index.
591     *
592     *   @param   start     the position in the input string to begin the search
593     *   @param   status    A reference to a UErrorCode to receive any errors.
594     *   @return  TRUE if a match is found.
595     *   @stable ICU 2.4
596     */
597     virtual UBool find(int32_t start, UErrorCode &status);
598 
599 
600    /**
601     *   Returns a string containing the text matched by the previous match.
602     *   If the pattern can match an empty string, an empty string may be returned.
603     *   @param   status      A reference to a UErrorCode to receive any errors.
604     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
605     *                        has been attempted or the last match failed.
606     *   @return  a string containing the matched input text.
607     *   @stable ICU 2.4
608     */
609     virtual UnicodeString group(UErrorCode &status) const;
610 
611 
612    /**
613     *    Returns a string containing the text captured by the given group
614     *    during the previous match operation.  Group(0) is the entire match.
615     *
616     *    @param groupNum the capture group number
617     *    @param   status     A reference to a UErrorCode to receive any errors.
618     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
619     *                        has been attempted or the last match failed and
620     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
621     *    @return the captured text
622     *    @stable ICU 2.4
623     */
624     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
625 
626 
627    /**
628     *   Returns the number of capturing groups in this matcher's pattern.
629     *   @return the number of capture groups
630     *   @stable ICU 2.4
631     */
632     virtual int32_t groupCount() const;
633 
634 
635    /**
636     *   Returns the index in the input string of the start of the text matched
637     *   during the previous match operation.
638     *    @param   status      a reference to a UErrorCode to receive any errors.
639     *    @return              The position in the input string of the start of the last match.
640     *    @stable ICU 2.4
641     */
642     virtual int32_t start(UErrorCode &status) const;
643 
644 
645    /**
646     *   Returns the index in the input string of the start of the text matched by the
647     *    specified capture group during the previous match operation.  Return -1 if
648     *    the capture group exists in the pattern, but was not part of the last match.
649     *
650     *    @param  group       the capture group number
651     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
652     *                        errors are  U_REGEX_INVALID_STATE if no match has been
653     *                        attempted or the last match failed, and
654     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
655     *    @return the start position of substring matched by the specified group.
656     *    @stable ICU 2.4
657     */
658     virtual int32_t start(int32_t group, UErrorCode &status) const;
659 
660 
661    /**
662     *    Returns the index in the input string of the first character following the
663     *    text matched during the previous match operation.
664     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
665     *                        errors are  U_REGEX_INVALID_STATE if no match has been
666     *                        attempted or the last match failed.
667     *    @return the index of the last character matched, plus one.
668     *   @stable ICU 2.4
669     */
670     virtual int32_t end(UErrorCode &status) const;
671 
672 
673    /**
674     *    Returns the index in the input string of the character following the
675     *    text matched by the specified capture group during the previous match operation.
676     *    @param group  the capture group number
677     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
678     *                        errors are  U_REGEX_INVALID_STATE if no match has been
679     *                        attempted or the last match failed and
680     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
681     *    @return  the index of the first character following the text
682     *              captured by the specified group during the previous match operation.
683     *              Return -1 if the capture group exists in the pattern but was not part of the match.
684     *    @stable ICU 2.4
685     */
686     virtual int32_t end(int32_t group, UErrorCode &status) const;
687 
688 
689    /**
690     *   Resets this matcher.  The effect is to remove any memory of previous matches,
691     *       and to cause subsequent find() operations to begin at the beginning of
692     *       the input string.
693     *
694     *   @return this RegexMatcher.
695     *   @stable ICU 2.4
696     */
697     virtual RegexMatcher &reset();
698 
699 
700    /**
701     *   Resets this matcher, and set the current input position.
702     *   The effect is to remove any memory of previous matches,
703     *       and to cause subsequent find() operations to begin at
704     *       the specified position in the input string.
705     * <p>
706     *   The matcher's region is reset to its default, which is the entire
707     *   input string.
708     * <p>
709     *   An alternative to this function is to set a match region
710     *   beginning at the desired index.
711     *
712     *   @return this RegexMatcher.
713     *   @stable ICU 2.8
714     */
715     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
716 
717 
718    /**
719     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
720     *     to be reused, which is more efficient than creating a new RegexMatcher for
721     *     each input string to be processed.
722     *   @param input The new string on which subsequent pattern matches will operate.
723     *                The matcher retains a reference to the caller's string, and operates
724     *                directly on that.  Ownership of the string remains with the caller.
725     *                Because no copy of the string is made, it is essential that the
726     *                caller not delete the string until after regexp operations on it
727     *                are done.
728     *   @return this RegexMatcher.
729     *   @stable ICU 2.4
730     */
731     virtual RegexMatcher &reset(const UnicodeString &input);
732 
733 private:
734     /**
735      * Cause a compilation error if an application accidentally attempts to
736      *   reset a matcher with a (UChar *) string as input rather than
737      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
738      * <p>
739      * To efficiently work with (UChar *) strings, wrap the data in a UnicodeString
740      * using one of the aliasing constructors, such as
741      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
742      *
743      * @internal
744      */
745     RegexMatcher &reset(const UChar *input);
746 public:
747 
748    /**
749     *   Returns the input string being matched.  The returned string is not a copy,
750     *   but the live input string.  It should not be altered or deleted.
751     *   @return the input string
752     *   @stable ICU 2.4
753     */
754     virtual const UnicodeString &input() const;
755 
756 
757 
758    /** Sets the limits of this matcher's region.
759      * The region is the part of the input string that will be searched to find a match.
760      * Invoking this method resets the matcher, and then sets the region to start
761      * at the index specified by the start parameter and end at the index specified
762      * by the end parameter.
763      *
764      * Depending on the transparency and anchoring being used (see useTransparentBounds
765      * and useAnchoringBounds), certain constructs such as anchors may behave differently
766      * at or around the boundaries of the region.
767      *
768      * The function will fail if start is greater than limit, or if either index
769      *  is less than zero or greater than the length of the string being matched.
770      *
771      * @param start  The index to begin searches at.
772      * @param limit  The index to end searches at (exclusive).
773      * @param status A reference to a UErrorCode to receive any errors.
774      * @draft ICU 4.0
775      */
776      virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
777 
778 
779    /**
780      * Reports the start index of this matcher's region. The searches this matcher
781      * conducts are limited to finding matches within regionStart (inclusive) and
782      * regionEnd (exclusive).
783      *
784      * @return The starting index of this matcher's region.
785      * @draft ICU 4.0
786      */
787      virtual int32_t regionStart() const;
788 
789 
790     /**
791       * Reports the end (limit) index (exclusive) of this matcher's region. The searches
792       * this matcher conducts are limited to finding matches within regionStart
793       * (inclusive) and regionEnd (exclusive).
794       *
795       * @return The ending point of this matcher's region.
796       * @draft ICU 4.0
797       */
798       virtual int32_t regionEnd() const;
799 
800     /**
801       * Queries the transparency of region bounds for this matcher.
802       * See useTransparentBounds for a description of transparent and opaque bounds.
803       * By default, a matcher uses opaque region boundaries.
804       *
805       * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
806       * @draft ICU 4.0
807       */
808       virtual UBool hasTransparentBounds() const;
809 
810     /**
811       * Sets the transparency of region bounds for this matcher.
812       * Invoking this function with an argument of true will set this matcher to use transparent bounds.
813       * If the boolean argument is false, then opaque bounds will be used.
814       *
815       * Using transparent bounds, the boundaries of this matcher's region are transparent
816       * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
817       * see text beyond the boundaries of the region while checking for a match.
818       *
819       * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
820       * lookbehind, and boundary matching constructs.
821       *
822       * By default, a matcher uses opaque bounds.
823       *
824       * @param   b TRUE for transparent bounds; FALSE for opaque bounds
825       * @return  This Matcher
826       * @draft   ICU 4.0
827       **/
828       virtual RegexMatcher &useTransparentBounds(UBool b);
829 
830 
831     /**
832       * Return true if this matcher is using anchoring bounds.
833       * By default, matchers use anchoring region bounds.
834       *
835       * @return TRUE if this matcher is using anchoring bounds.
836       * @draft  ICU 4.0
837       */
838       virtual UBool hasAnchoringBounds() const;
839 
840     /**
841       * Set whether this matcher is using Anchoring Bounds for its region.
842       * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
843       * and end of the region.  Without Anchoring Bounds, anchors will only match at
844       * the positions they would in the complete text.
845       *
846       * Anchoring Bounds are the default for regions.
847       *
848       * @param b TRUE to enable anchoring bounds; FALSE to disable them.
849       * @return  This Matcher
850       * @draft   ICU 4.0
851       */
852       virtual RegexMatcher &useAnchoringBounds(UBool b);
853 
854     /**
855       * Return TRUE if the most recent matching operation touched the
856       *  end of the text being processed.  In this case, additional input text could
857       *  change the results of that match.
858       *
859       *  hitEnd() is defined for both successful and unsuccessful matches.
860       *  In either case hitEnd() will return TRUE if the end of the text was
861       *  reached at any point during the matching process.
862       *
863       *  @return  TRUE if the most recent match hit the end of input
864       *  @draft   ICU 4.0
865       */
866       virtual UBool hitEnd() const;
867 
868     /**
869       * Return TRUE if the most recent match succeeded and additional input could cause
870       * it to fail. If this method returns false and a match was found, then more input
871       * might change the match but the match won't be lost. If a match was not found,
872       * then requireEnd has no meaning.
873       *
874       * @return TRUE if more input could cause the most recent match to no longer match.
875       * @draft  ICU 4.0
876       */
877       virtual UBool requireEnd() const;
878 
879 
880 
881 
882 
883    /**
884     *    Returns the pattern that is interpreted by this matcher.
885     *    @return  the RegexPattern for this RegexMatcher
886     *    @stable ICU 2.4
887     */
888     virtual const RegexPattern &pattern() const;
889 
890 
891    /**
892     *    Replaces every substring of the input that matches the pattern
893     *    with the given replacement string.  This is a convenience function that
894     *    provides a complete find-and-replace-all operation.
895     *
896     *    This method first resets this matcher. It then scans the input string
897     *    looking for matches of the pattern. Input that is not part of any
898     *    match is left unchanged; each match is replaced in the result by the
899     *    replacement string. The replacement string may contain references to
900     *    capture groups.
901     *
902     *    @param   replacement a string containing the replacement text.
903     *    @param   status      a reference to a UErrorCode to receive any errors.
904     *    @return              a string containing the results of the find and replace.
905     *    @stable ICU 2.4
906     */
907     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
908 
909 
910    /**
911     * Replaces the first substring of the input that matches
912     * the pattern with the replacement string.   This is a convenience
913     * function that provides a complete find-and-replace operation.
914     *
915     * <p>This function first resets this RegexMatcher. It then scans the input string
916     * looking for a match of the pattern. Input that is not part
917     * of the match is appended directly to the result string; the match is replaced
918     * in the result by the replacement string. The replacement string may contain
919     * references to captured groups.</p>
920     *
921     * <p>The state of the matcher (the position at which a subsequent find()
922     *    would begin) after completing a replaceFirst() is not specified.  The
923     *    RegexMatcher should be reset before doing additional find() operations.</p>
924     *
925     *    @param   replacement a string containing the replacement text.
926     *    @param   status      a reference to a UErrorCode to receive any errors.
927     *    @return              a string containing the results of the find and replace.
928     *    @stable ICU 2.4
929     */
930     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
931 
932    /**
933     *   Implements a replace operation intended to be used as part of an
934     *   incremental find-and-replace.
935     *
936     *   <p>The input string, starting from the end of the previous replacement and ending at
937     *   the start of the current match, is appended to the destination string.  Then the
938     *   replacement string is appended to the output string,
939     *   including handling any substitutions of captured text.</p>
940     *
941     *   <p>For simple, prepackaged, non-incremental find-and-replace
942     *   operations, see replaceFirst() or replaceAll().</p>
943     *
944     *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
945     *   @param   replacement A UnicodeString that provides the text to be substituted for
946     *                        the input text that matched the regexp pattern.  The replacement
947     *                        text may contain references to captured text from the
948     *                        input.
949     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
950     *                        errors are  U_REGEX_INVALID_STATE if no match has been
951     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
952     *                        if the replacement text specifies a capture group that
953     *                        does not exist in the pattern.
954     *
955     *   @return  this  RegexMatcher
956     *   @stable ICU 2.4
957     *
958     */
959     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
960         const UnicodeString &replacement, UErrorCode &status);
961 
962 
963    /**
964     * As the final step in a find-and-replace operation, append the remainder
965     * of the input string, starting at the position following the last appendReplacement(),
966     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
967     * or more invocations of <code>RegexMatcher::appendReplacement()</code>.
968     *
969     *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
970     *  @return  the destination string.
971     *  @stable ICU 2.4
972     */
973     virtual UnicodeString &appendTail(UnicodeString &dest);
974 
975 
976 
977     /**
978      * Split a string into fields.  Somewhat like split() from Perl.
979      * The pattern matches identify delimiters that separate the input
980      *  into fields.  The input data between the matches becomes the
981      *  fields themselves.
982      * <p>
983      *
984      * @param input   The string to be split into fields.  The field delimiters
985      *                match the pattern (in the "this" object).  This matcher
986      *                will be reset to this input string.
987      * @param dest    An array of UnicodeStrings to receive the results of the split.
988      *                This is an array of actual UnicodeString objects, not an
989      *                array of pointers to strings.  Local (stack based) arrays can
990      *                work well here.
991      * @param destCapacity  The number of elements in the destination array.
992      *                If the number of fields found is less than destCapacity, the
993      *                extra strings in the destination array are not altered.
994      *                If the number of destination strings is less than the number
995      *                of fields, the trailing part of the input string, including any
996      *                field delimiters, is placed in the last destination string.
997      * @param status  A reference to a UErrorCode to receive any errors.
998      * @return        The number of fields into which the input string was split.
999      * @stable ICU 2.6
1000      */
1001     virtual int32_t  split(const UnicodeString &input,
1002         UnicodeString    dest[],
1003         int32_t          destCapacity,
1004         UErrorCode       &status);
1005 
1006 
1007 
1008    /**
1009      *   setTrace   Debug function, enable/disable tracing of the matching engine.
1010      *              For internal ICU development use only.  DO NOT USE!!!!
          *   @param state  TRUE to enable tracing of the match engine, FALSE to disable it.
1011      *   @internal
1012      */
1013     void setTrace(UBool state);
1014 
1015 
1016     /**
1017     * ICU "poor man's RTTI", returns a UClassID for this class.
1018     *
1019     * @stable ICU 2.2
1020     */
1021     static UClassID U_EXPORT2 getStaticClassID();
1022 
1023     /**
1024      * ICU "poor man's RTTI", returns a UClassID for the actual class.
1025      *
1026      * @stable ICU 2.2
1027      */
1028     virtual UClassID getDynamicClassID() const;
1029 
1030 private:
1031     // Constructors and other object boilerplate are private.
1032     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1033     RegexMatcher(); // default constructor not implemented
1034     RegexMatcher(const RegexPattern *pat);
1035     RegexMatcher(const RegexMatcher &other);
1036     RegexMatcher &operator =(const RegexMatcher &rhs);
1037     friend class RegexPattern;
1038     friend class RegexCImpl;
1039 public:
1040     /** @internal  */
1041     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
1042 private:
1043 
1044     //
1045     //  MatchAt   This is the internal interface to the match engine itself.
1046     //            Match status comes back in matcher member variables.
1047     //
1048     void                 MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1049     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
1050     UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
1051     UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
1052     REStackFrame        *resetStack();
1053     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
1054                                    int32_t frameSize, UErrorCode &status);
1055 
1056 
1057     const RegexPattern  *fPattern;
1058     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
1059                                            //   should delete it when through.
1060 
1061     const UnicodeString *fInput;           // The text being matched. Is never NULL.
1062 
1063     int32_t              fRegionStart;     // Start of the input region, default = 0.
1064     int32_t              fRegionLimit;     // End of input region, default to input.length.
1065 
1066     int32_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
1067     int32_t              fAnchorLimit;     //   See useAnchoringBounds
1068 
1069     int32_t              fLookStart;       // Region bounds for look-ahead/behind and
1070     int32_t              fLookLimit;       //   other boundary tests.  See
1071                                            //   useTransparentBounds
1072 
1073     int32_t              fActiveStart;     // Currently active bounds for matching.
1074     int32_t              fActiveLimit;     //   Usually is the same as region, but
1075                                            //   is changed to fLookStart/Limit when
1076                                            //   entering look around regions.
1077 
1078     UBool                fTransparentBounds;  // True if using transparent bounds.
1079     UBool                fAnchoringBounds; // True if using anchoring bounds.
1080 
1081     UBool                fMatch;           // True if the last attempted match was successful.
1082     int32_t              fMatchStart;      // Position of the start of the most recent match
1083     int32_t              fMatchEnd;        // First position after the end of the most recent match
1084                                            //   Zero if no previous match, even when a region
1085                                            //   is active.
1086     int32_t              fLastMatchEnd;    // First position after the end of the previous match,
1087                                            //   or -1 if there was no previous match.
1088     int32_t              fAppendPosition;  // First position after the end of the previous
1089                                            //   appendReplacement().  As described by the
1090                                            //   JavaDoc for Java Matcher, where it is called
1091                                            //   "append position"
1092     UBool                fHitEnd;          // True if the last match touched the end of input.
1093     UBool                fRequireEnd;      // True if the last match required end-of-input
1094                                            //    (matched $ or Z)
1095 
1096     UVector32           *fStack;
1097     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
1098                                            //   which will contain the capture group results.
1099                                            //   NOT valid while match engine is running.
1100 
1101     int32_t             *fData;            // Data area for use by the compiled pattern.
1102     int32_t             fSmallData[8];     //   Use this for data if it's enough.
1103 
1104     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
1105 
1106     UErrorCode          fDeferredStatus;   // Save error state if that cannot be immediately
1107                                            //   reported, or that permanently disables this matcher.
1108 
1109     RuleBasedBreakIterator  *fWordBreakItr;
1110 
1111 
1112 };
1113 
1114 U_NAMESPACE_END
1115 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
1116 #endif
1117