• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2007 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "regeximp.h"
27 #include "regexst.h"
28 
29 // #include <malloc.h>        // Needed for heapcheck testing
30 
31 U_NAMESPACE_BEGIN
32 
33 //-----------------------------------------------------------------------------
34 //
35 //   Constructor and Destructor
36 //
37 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)38 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
39     fPattern           = pat;
40     fPatternOwned      = NULL;
41     fInput             = NULL;
42     fTraceDebug        = FALSE;
43     fDeferredStatus    = U_ZERO_ERROR;
44     fStack             = new UVector32(fDeferredStatus);
45     fData              = fSmallData;
46     fWordBreakItr      = NULL;
47     fTransparentBounds = FALSE;
48     fAnchoringBounds   = TRUE;
49     if (pat==NULL) {
50         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
51         return;
52     }
53     if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
54         fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
55     }
56     if (fStack == NULL || fData == NULL) {
57         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
58     }
59 
60     reset(RegexStaticSets::gStaticSets->fEmptyString);
61 }
62 
63 
64 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)65 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
66                            uint32_t flags, UErrorCode &status) {
67     UParseError    pe;
68     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
69     fPattern           = fPatternOwned;
70     fTraceDebug        = FALSE;
71     fDeferredStatus    = U_ZERO_ERROR;
72     fStack             = new UVector32(status);
73     fData              = fSmallData;
74     fWordBreakItr      = NULL;
75     fTransparentBounds = FALSE;
76     fAnchoringBounds   = TRUE;
77     if (U_FAILURE(status)) {
78         return;
79     }
80     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
81         fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
82     }
83     if (fStack == NULL || fData == NULL) {
84         status = U_MEMORY_ALLOCATION_ERROR;
85     }
86     reset(input);
87 }
88 
89 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)90 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
91                            uint32_t flags, UErrorCode &status) {
92     UParseError    pe;
93     fTraceDebug        = FALSE;
94     fDeferredStatus    = U_ZERO_ERROR;
95     fStack             = new UVector32(status);
96     fData              = fSmallData;
97     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
98     fPattern           = fPatternOwned;
99     fWordBreakItr      = NULL;
100     fTransparentBounds = FALSE;
101     fAnchoringBounds   = TRUE;
102     if (U_FAILURE(status)) {
103         return;
104     }
105 
106     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
107         fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
108     }
109     if (fStack == NULL || fData == NULL) {
110         status = U_MEMORY_ALLOCATION_ERROR;
111     }
112     reset(RegexStaticSets::gStaticSets->fEmptyString);
113 }
114 
115 
116 
~RegexMatcher()117 RegexMatcher::~RegexMatcher() {
118     delete fStack;
119     if (fData != fSmallData) {
120         uprv_free(fData);
121         fData = NULL;
122     }
123     if (fPatternOwned) {
124         delete fPatternOwned;
125         fPatternOwned = NULL;
126         fPattern = NULL;
127     }
128     #if UCONFIG_NO_BREAK_ITERATION==0
129     delete fWordBreakItr;
130     #endif
131 }
132 
133 
134 
135 static const UChar BACKSLASH  = 0x5c;
136 static const UChar DOLLARSIGN = 0x24;
137 //--------------------------------------------------------------------------------
138 //
139 //    appendReplacement
140 //
141 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)142 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
143                                               const UnicodeString &replacement,
144                                               UErrorCode &status) {
145     if (U_FAILURE(status)) {
146         return *this;
147     }
148     if (U_FAILURE(fDeferredStatus)) {
149         status = fDeferredStatus;
150         return *this;
151     }
152     if (fMatch == FALSE) {
153         status = U_REGEX_INVALID_STATE;
154         return *this;
155     }
156 
157     // Copy input string from the end of previous match to start of current match
158     int32_t  len = fMatchStart-fAppendPosition;
159     if (len > 0) {
160         dest.append(*fInput, fAppendPosition, len);
161     }
162     fAppendPosition = fMatchEnd;
163 
164 
165     // scan the replacement text, looking for substitutions ($n) and \escapes.
166     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
167     //         move entire ranges not containing substitutions.
168     int32_t  replLen = replacement.length();
169     int32_t  replIdx = 0;
170     while (replIdx<replLen) {
171         UChar  c = replacement.charAt(replIdx);
172         replIdx++;
173         if (c == BACKSLASH) {
174             // Backslash Escape.  Copy the following char out without further checks.
175             //                    Note:  Surrogate pairs don't need any special handling
176             //                           The second half wont be a '$' or a '\', and
177             //                           will move to the dest normally on the next
178             //                           loop iteration.
179             if (replIdx >= replLen) {
180                 break;
181             }
182             c = replacement.charAt(replIdx);
183 
184             if (c==0x55/*U*/ || c==0x75/*u*/) {
185                 // We have a \udddd or \Udddddddd escape sequence.
186                 UChar32 escapedChar = replacement.unescapeAt(replIdx);
187                 if (escapedChar != (UChar32)0xFFFFFFFF) {
188                     dest.append(escapedChar);
189                     // TODO:  Report errors for mal-formed \u escapes?
190                     //        As this is, the original sequence is output, which may be OK.
191                     continue;
192                 }
193             }
194 
195             // Plain backslash escape.  Just put out the escaped character.
196             dest.append(c);
197             replIdx++;
198             continue;
199         }
200 
201         if (c != DOLLARSIGN) {
202             // Normal char, not a $.  Copy it out without further checks.
203             dest.append(c);
204             continue;
205         }
206 
207         // We've got a $.  Pick up a capture group number if one follows.
208         // Consume at most the number of digits necessary for the largest capture
209         // number that is valid for this pattern.
210 
211         int32_t numDigits = 0;
212         int32_t groupNum  = 0;
213         UChar32 digitC;
214         for (;;) {
215             if (replIdx >= replLen) {
216                 break;
217             }
218             digitC = replacement.char32At(replIdx);
219             if (u_isdigit(digitC) == FALSE) {
220                 break;
221             }
222             replIdx = replacement.moveIndex32(replIdx, 1);
223             groupNum=groupNum*10 + u_charDigitValue(digitC);
224             numDigits++;
225             if (numDigits >= fPattern->fMaxCaptureDigits) {
226                 break;
227             }
228         }
229 
230 
231         if (numDigits == 0) {
232             // The $ didn't introduce a group number at all.
233             // Treat it as just part of the substitution text.
234             dest.append(DOLLARSIGN);
235             continue;
236         }
237 
238         // Finally, append the capture group data to the destination.
239         dest.append(group(groupNum, status));
240         if (U_FAILURE(status)) {
241             // Can fail if group number is out of range.
242             break;
243         }
244 
245     }
246 
247     return *this;
248 }
249 
250 
251 
252 //--------------------------------------------------------------------------------
253 //
254 //    appendTail     Intended to be used in conjunction with appendReplacement()
255 //                   To the destination string, append everything following
256 //                   the last match position from the input string.
257 //
258 //                   Note:  Match ranges do not affect appendTail or appendReplacement
259 //
260 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)261 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
262     int32_t  len = fInput->length() - fAppendPosition;
263     if (len > 0) {
264         dest.append(*fInput, fAppendPosition, len);
265     }
266     return dest;
267 }
268 
269 
270 
271 //--------------------------------------------------------------------------------
272 //
273 //   end
274 //
275 //--------------------------------------------------------------------------------
end(UErrorCode & err) const276 int32_t RegexMatcher::end(UErrorCode &err) const {
277     return end(0, err);
278 }
279 
280 
281 
end(int32_t group,UErrorCode & err) const282 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
283     if (U_FAILURE(err)) {
284         return -1;
285     }
286     if (fMatch == FALSE) {
287         err = U_REGEX_INVALID_STATE;
288         return -1;
289     }
290     if (group < 0 || group > fPattern->fGroupMap->size()) {
291         err = U_INDEX_OUTOFBOUNDS_ERROR;
292         return -1;
293     }
294     int32_t e = -1;
295     if (group == 0) {
296         e = fMatchEnd;
297     } else {
298         // Get the position within the stack frame of the variables for
299         //    this capture group.
300         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
301         U_ASSERT(groupOffset < fPattern->fFrameSize);
302         U_ASSERT(groupOffset >= 0);
303         e = fFrame->fExtra[groupOffset + 1];
304     }
305     return e;
306 }
307 
308 
309 
310 //--------------------------------------------------------------------------------
311 //
312 //   find()
313 //
314 //--------------------------------------------------------------------------------
find()315 UBool RegexMatcher::find() {
316     // Start at the position of the last match end.  (Will be zero if the
317     //   matcher has been reset.
318     //
319     if (U_FAILURE(fDeferredStatus)) {
320         return FALSE;
321     }
322 
323     int32_t startPos = fMatchEnd;
324     if (startPos==0) {
325         startPos = fActiveStart;
326     }
327 
328     if (fMatch) {
329         // Save the position of any previous successful match.
330         fLastMatchEnd = fMatchEnd;
331 
332         if (fMatchStart == fMatchEnd) {
333             // Previous match had zero length.  Move start position up one position
334             //  to avoid sending find() into a loop on zero-length matches.
335             if (startPos >= fActiveLimit) {
336                 fMatch = FALSE;
337                 fHitEnd = TRUE;
338                 return FALSE;
339             }
340             startPos = fInput->moveIndex32(startPos, 1);
341         }
342     } else {
343         if (fLastMatchEnd >= 0) {
344             // A previous find() failed to match.  Don't try again.
345             //   (without this test, a pattern with a zero-length match
346             //    could match again at the end of an input string.)
347             fHitEnd = TRUE;
348             return FALSE;
349         }
350     }
351 
352 
353     // Compute the position in the input string beyond which a match can not begin, because
354     //   the minimum length match would extend past the end of the input.
355     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
356     //          Be aware of possible overflows if making changes here.
357     int32_t testLen  = fActiveLimit - fPattern->fMinMatchLen;
358     if (startPos > testLen) {
359         fMatch = FALSE;
360         fHitEnd = TRUE;
361         return FALSE;
362     }
363 
364     const UChar *inputBuf = fInput->getBuffer();
365     UChar32  c;
366     U_ASSERT(startPos >= 0);
367 
368     switch (fPattern->fStartType) {
369     case START_NO_INFO:
370         // No optimization was found.
371         //  Try a match at each input position.
372         for (;;) {
373             MatchAt(startPos, FALSE, fDeferredStatus);
374             if (U_FAILURE(fDeferredStatus)) {
375                 return FALSE;
376             }
377             if (fMatch) {
378                 return TRUE;
379             }
380             if (startPos >= testLen) {
381                 fHitEnd = TRUE;
382                 return FALSE;
383             }
384             U16_FWD_1(inputBuf, startPos, fActiveLimit);
385             // Note that it's perfectly OK for a pattern to have a zero-length
386             //   match at the end of a string, so we must make sure that the loop
387             //   runs with startPos == testLen the last time through.
388         }
389         U_ASSERT(FALSE);
390 
391     case START_START:
392         // Matches are only possible at the start of the input string
393         //   (pattern begins with ^ or \A)
394         if (startPos > fActiveStart) {
395             fMatch = FALSE;
396             return FALSE;
397         }
398         MatchAt(startPos, FALSE, fDeferredStatus);
399         if (U_FAILURE(fDeferredStatus)) {
400             return FALSE;
401         }
402         return fMatch;
403 
404 
405     case START_SET:
406         {
407             // Match may start on any char from a pre-computed set.
408             U_ASSERT(fPattern->fMinMatchLen > 0);
409             for (;;) {
410                 int32_t pos = startPos;
411                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
412                 if (c<256 && fPattern->fInitialChars8->contains(c) ||
413                     c>=256 && fPattern->fInitialChars->contains(c)) {
414                     MatchAt(pos, FALSE, fDeferredStatus);
415                     if (U_FAILURE(fDeferredStatus)) {
416                         return FALSE;
417                     }
418                     if (fMatch) {
419                         return TRUE;
420                     }
421                 }
422                 if (pos >= testLen) {
423                     fMatch = FALSE;
424                     fHitEnd = TRUE;
425                     return FALSE;
426                 }
427             }
428         }
429         U_ASSERT(FALSE);
430 
431     case START_STRING:
432     case START_CHAR:
433         {
434             // Match starts on exactly one char.
435             U_ASSERT(fPattern->fMinMatchLen > 0);
436             UChar32 theChar = fPattern->fInitialChar;
437             for (;;) {
438                 int32_t pos = startPos;
439                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
440                 if (c == theChar) {
441                     MatchAt(pos, FALSE, fDeferredStatus);
442                     if (U_FAILURE(fDeferredStatus)) {
443                         return FALSE;
444                     }
445                     if (fMatch) {
446                         return TRUE;
447                     }
448                 }
449                 if (pos >= testLen) {
450                     fMatch = FALSE;
451                     fHitEnd = TRUE;
452                     return FALSE;
453                 }
454             }
455         }
456         U_ASSERT(FALSE);
457 
458     case START_LINE:
459         {
460             UChar32  c;
461             if (startPos == fAnchorStart) {
462                 MatchAt(startPos, FALSE, fDeferredStatus);
463                 if (U_FAILURE(fDeferredStatus)) {
464                     return FALSE;
465                 }
466                 if (fMatch) {
467                     return TRUE;
468                 }
469                 U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
470             }
471 
472             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
473                for (;;) {
474                     c = inputBuf[startPos-1];
475                     if (c == 0x0a) {
476                             MatchAt(startPos, FALSE, fDeferredStatus);
477                             if (U_FAILURE(fDeferredStatus)) {
478                                 return FALSE;
479                             }
480                             if (fMatch) {
481                                 return TRUE;
482                             }
483                     }
484                     if (startPos >= testLen) {
485                         fMatch = FALSE;
486                         fHitEnd = TRUE;
487                         return FALSE;
488                     }
489                     U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
490                     // Note that it's perfectly OK for a pattern to have a zero-length
491                     //   match at the end of a string, so we must make sure that the loop
492                     //   runs with startPos == testLen the last time through.
493                 }
494             } else {
495                 for (;;) {
496                     c = inputBuf[startPos-1];
497                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
498                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
499                             if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
500                                 startPos++;
501                             }
502                             MatchAt(startPos, FALSE, fDeferredStatus);
503                             if (U_FAILURE(fDeferredStatus)) {
504                                 return FALSE;
505                             }
506                             if (fMatch) {
507                                 return TRUE;
508                             }
509                     }
510                     if (startPos >= testLen) {
511                         fMatch = FALSE;
512                         fHitEnd = TRUE;
513                         return FALSE;
514                     }
515                     U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
516                     // Note that it's perfectly OK for a pattern to have a zero-length
517                     //   match at the end of a string, so we must make sure that the loop
518                     //   runs with startPos == testLen the last time through.
519                 }
520             }
521         }
522 
523     default:
524         U_ASSERT(FALSE);
525     }
526 
527     U_ASSERT(FALSE);
528     return FALSE;
529 }
530 
531 
532 
find(int32_t start,UErrorCode & status)533 UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
534     if (U_FAILURE(status)) {
535         return FALSE;
536     }
537     if (U_FAILURE(fDeferredStatus)) {
538         status = fDeferredStatus;
539         return FALSE;
540     }
541     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
542                                           //        This will reset the region to be the full input length.
543     if (start < fActiveStart || start > fActiveLimit) {
544         status = U_INDEX_OUTOFBOUNDS_ERROR;
545         return FALSE;
546     }
547     fMatchEnd = start;
548     return find();
549 }
550 
551 
552 
553 //--------------------------------------------------------------------------------
554 //
555 //  group()
556 //
557 //--------------------------------------------------------------------------------
group(UErrorCode & status) const558 UnicodeString RegexMatcher::group(UErrorCode &status) const {
559     return group(0, status);
560 }
561 
562 
563 
group(int32_t groupNum,UErrorCode & status) const564 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
565     int32_t  s = start(groupNum, status);
566     int32_t  e = end(groupNum, status);
567 
568     // Note:  calling start() and end() above will do all necessary checking that
569     //        the group number is OK and that a match exists.  status will be set.
570     if (U_FAILURE(status)) {
571         return UnicodeString();
572     }
573     if (U_FAILURE(fDeferredStatus)) {
574         status = fDeferredStatus;
575         return UnicodeString();
576     }
577 
578     if (s < 0) {
579         // A capture group wasn't part of the match
580         return UnicodeString();
581     }
582     U_ASSERT(s <= e);
583     return UnicodeString(*fInput, s, e-s);
584 }
585 
586 
587 
588 
groupCount() const589 int32_t RegexMatcher::groupCount() const {
590     return fPattern->fGroupMap->size();
591 }
592 
593 
594 
input() const595 const UnicodeString &RegexMatcher::input() const {
596     return *fInput;
597 }
598 
599 
600 //--------------------------------------------------------------------------------
601 //
602 //  hasAnchoringBounds()
603 //
604 //--------------------------------------------------------------------------------
hasAnchoringBounds() const605 UBool RegexMatcher::hasAnchoringBounds() const {
606     return fAnchoringBounds;
607 }
608 
609 
610 //--------------------------------------------------------------------------------
611 //
612 //  hasTransparentBounds()
613 //
614 //--------------------------------------------------------------------------------
hasTransparentBounds() const615 UBool RegexMatcher::hasTransparentBounds() const {
616     return fTransparentBounds;
617 }
618 
619 
620 
621 //--------------------------------------------------------------------------------
622 //
623 //  hitEnd()
624 //
625 //--------------------------------------------------------------------------------
hitEnd() const626 UBool RegexMatcher::hitEnd() const {
627     return fHitEnd;
628 }
629 
630 //--------------------------------------------------------------------------------
631 //
632 //  lookingAt()
633 //
634 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)635 UBool RegexMatcher::lookingAt(UErrorCode &status) {
636     if (U_FAILURE(status)) {
637         return FALSE;
638     }
639     if (U_FAILURE(fDeferredStatus)) {
640         status = fDeferredStatus;
641         return FALSE;
642     }
643     resetPreserveRegion();
644     MatchAt(fActiveStart, FALSE, status);
645     return fMatch;
646 }
647 
648 
lookingAt(int32_t start,UErrorCode & status)649 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
650     if (U_FAILURE(status)) {
651         return FALSE;
652     }
653     if (U_FAILURE(fDeferredStatus)) {
654         status = fDeferredStatus;
655         return FALSE;
656     }
657     reset();
658     if (start < fActiveStart || start > fActiveLimit) {
659         status = U_INDEX_OUTOFBOUNDS_ERROR;
660         return FALSE;
661     }
662     MatchAt(start, FALSE, status);
663     return fMatch;
664 }
665 
666 
667 
668 //--------------------------------------------------------------------------------
669 //
670 //  matches()
671 //
672 //--------------------------------------------------------------------------------
matches(UErrorCode & status)673 UBool RegexMatcher::matches(UErrorCode &status) {
674     if (U_FAILURE(status)) {
675         return FALSE;
676     }
677     if (U_FAILURE(fDeferredStatus)) {
678         status = fDeferredStatus;
679         return FALSE;
680     }
681     resetPreserveRegion();
682     MatchAt(fActiveStart, TRUE, status);
683     return fMatch;
684 }
685 
686 
matches(int32_t start,UErrorCode & status)687 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
688     if (U_FAILURE(status)) {
689         return FALSE;
690     }
691     if (U_FAILURE(fDeferredStatus)) {
692         status = fDeferredStatus;
693         return FALSE;
694     }
695     reset();
696     if (start < fActiveStart || start > fActiveLimit) {
697         status = U_INDEX_OUTOFBOUNDS_ERROR;
698         return FALSE;
699     }
700     MatchAt(start, TRUE, status);
701     return fMatch;
702 }
703 
704 
705 
706 //--------------------------------------------------------------------------------
707 //
708 //    pattern
709 //
710 //--------------------------------------------------------------------------------
pattern() const711 const RegexPattern &RegexMatcher::pattern() const {
712     return *fPattern;
713 }
714 
715 
716 
717 //--------------------------------------------------------------------------------
718 //
719 //    region
720 //
721 //--------------------------------------------------------------------------------
region(int32_t start,int32_t limit,UErrorCode & status)722 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
723     if (U_FAILURE(status)) {
724         return *this;
725     }
726     if (start>limit || start<0 || limit<0 || limit>fInput->length()) {
727         status = U_ILLEGAL_ARGUMENT_ERROR;
728     }
729     this->reset();
730     fRegionStart = start;
731     fRegionLimit = limit;
732     fActiveStart = start;
733     fActiveLimit = limit;
734     if (!fTransparentBounds) {
735         fLookStart = start;
736         fLookLimit = limit;
737     }
738     if (fAnchoringBounds) {
739         fAnchorStart = start;
740         fAnchorLimit = limit;
741     }
742     return *this;
743 }
744 
745 
746 
747 //--------------------------------------------------------------------------------
748 //
749 //    regionEnd
750 //
751 //--------------------------------------------------------------------------------
regionEnd() const752 int RegexMatcher::regionEnd() const {
753     return fRegionLimit;
754 }
755 
756 
757 //--------------------------------------------------------------------------------
758 //
759 //    regionStart
760 //
761 //--------------------------------------------------------------------------------
regionStart() const762 int RegexMatcher::regionStart() const {
763     return fRegionStart;
764 }
765 
766 
767 //--------------------------------------------------------------------------------
768 //
769 //    replaceAll
770 //
771 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)772 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
773     if (U_FAILURE(status)) {
774         return *fInput;
775     }
776     if (U_FAILURE(fDeferredStatus)) {
777         status = fDeferredStatus;
778         return *fInput;
779     }
780     UnicodeString destString;
781     reset();
782     while (find()) {
783         appendReplacement(destString, replacement, status);
784         if (U_FAILURE(status)) {
785             break;
786         }
787     }
788     appendTail(destString);
789     return destString;
790 }
791 
792 
793 
794 
795 //--------------------------------------------------------------------------------
796 //
797 //    replaceFirst
798 //
799 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)800 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
801     if (U_FAILURE(status)) {
802         return *fInput;
803     }
804     if (U_FAILURE(fDeferredStatus)) {
805         status = fDeferredStatus;
806         return *fInput;
807     }
808 
809     reset();
810     if (!find()) {
811         return *fInput;
812     }
813 
814     UnicodeString destString;
815     appendReplacement(destString, replacement, status);
816     appendTail(destString);
817     return destString;
818 }
819 
820 
821 //--------------------------------------------------------------------------------
822 //
823 //     requireEnd
824 //
825 //--------------------------------------------------------------------------------
requireEnd() const826 UBool RegexMatcher::requireEnd() const {
827     return fRequireEnd;
828 }
829 
830 
831 //--------------------------------------------------------------------------------
832 //
833 //     reset
834 //
835 //--------------------------------------------------------------------------------
reset()836 RegexMatcher &RegexMatcher::reset() {
837     fRegionStart    = 0;
838     fRegionLimit    = fInput->length();
839     fActiveStart    = 0;
840     fActiveLimit    = fRegionLimit;
841     fAnchorStart    = 0;
842     fAnchorLimit    = fRegionLimit;
843     fLookStart      = 0;
844     fLookLimit      = fRegionLimit;
845     resetPreserveRegion();
846     return *this;
847 }
848 
849 
850 
resetPreserveRegion()851 void RegexMatcher::resetPreserveRegion() {
852     fMatchStart     = 0;
853     fMatchEnd       = 0;
854     fLastMatchEnd   = -1;
855     fAppendPosition = 0;
856     fMatch          = FALSE;
857     fHitEnd         = FALSE;
858     fRequireEnd     = FALSE;
859     resetStack();
860 }
861 
862 
reset(const UnicodeString & input)863 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
864     fInput          = &input;
865     reset();
866     if (fWordBreakItr != NULL) {
867         #if UCONFIG_NO_BREAK_ITERATION==0
868         fWordBreakItr->setText(input);
869         #endif
870     }
871     return *this;
872 }
873 
874 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
875     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
876     return *this;
877 }*/
878 
879 
reset(int32_t position,UErrorCode & status)880 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
881     if (U_FAILURE(status)) {
882         return *this;
883     }
884     reset();       // Reset also resets the region to be the entire string.
885     if (position < 0 || position >= fActiveLimit) {
886         status = U_INDEX_OUTOFBOUNDS_ERROR;
887         return *this;
888     }
889     fMatchEnd = position;
890     return *this;
891 }
892 
893 
894 
895 
896 
897 //--------------------------------------------------------------------------------
898 //
899 //    setTrace
900 //
901 //--------------------------------------------------------------------------------
setTrace(UBool state)902 void RegexMatcher::setTrace(UBool state) {
903     fTraceDebug = state;
904 }
905 
906 
907 
908 //---------------------------------------------------------------------
909 //
910 //   split
911 //
912 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)913 int32_t  RegexMatcher::split(const UnicodeString &input,
914         UnicodeString    dest[],
915         int32_t          destCapacity,
916         UErrorCode       &status)
917 {
918     //
919     // Check arguements for validity
920     //
921     if (U_FAILURE(status)) {
922         return 0;
923     };
924 
925     if (destCapacity < 1) {
926         status = U_ILLEGAL_ARGUMENT_ERROR;
927         return 0;
928     }
929 
930     //
931     // Reset for the input text
932     //
933     reset(input);
934     int32_t   nextOutputStringStart = 0;
935     if (fActiveLimit == 0) {
936         return 0;
937     }
938 
939     //
940     // Loop through the input text, searching for the delimiter pattern
941     //
942     int32_t i;
943     int32_t numCaptureGroups = fPattern->fGroupMap->size();
944     for (i=0; ; i++) {
945         if (i>=destCapacity-1) {
946             // There is one or zero output string left.
947             // Fill the last output string with whatever is left from the input, then exit the loop.
948             //  ( i will be == destCapicity if we filled the output array while processing
949             //    capture groups of the delimiter expression, in which case we will discard the
950             //    last capture group saved in favor of the unprocessed remainder of the
951             //    input string.)
952             i = destCapacity-1;
953             int32_t remainingLength = fActiveLimit-nextOutputStringStart;
954             if (remainingLength > 0) {
955                 dest[i].setTo(input, nextOutputStringStart, remainingLength);
956             }
957             break;
958         }
959         if (find()) {
960             // We found another delimiter.  Move everything from where we started looking
961             //  up until the start of the delimiter into the next output string.
962             int32_t fieldLen = fMatchStart - nextOutputStringStart;
963             dest[i].setTo(input, nextOutputStringStart, fieldLen);
964             nextOutputStringStart = fMatchEnd;
965 
966             // If the delimiter pattern has capturing parentheses, the captured
967             //  text goes out into the next n destination strings.
968             int32_t groupNum;
969             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
970                 if (i==destCapacity-1) {
971                     break;
972                 }
973                 i++;
974                 dest[i] = group(groupNum, status);
975             }
976 
977             if (nextOutputStringStart == fActiveLimit) {
978                 // The delimiter was at the end of the string.  We're done.
979                 break;
980             }
981 
982         }
983         else
984         {
985             // We ran off the end of the input while looking for the next delimiter.
986             // All the remaining text goes into the current output string.
987             dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
988             break;
989         }
990     }
991     return i+1;
992 }
993 
994 
995 
996 //--------------------------------------------------------------------------------
997 //
998 //     start
999 //
1000 //--------------------------------------------------------------------------------
start(UErrorCode & status) const1001 int32_t RegexMatcher::start(UErrorCode &status) const {
1002     return start(0, status);
1003 }
1004 
1005 
1006 
1007 
1008 //--------------------------------------------------------------------------------
1009 //
1010 //     start(int32_t group, UErrorCode &status)
1011 //
1012 //--------------------------------------------------------------------------------
start(int32_t group,UErrorCode & status) const1013 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
1014     if (U_FAILURE(status)) {
1015         return -1;
1016     }
1017     if (U_FAILURE(fDeferredStatus)) {
1018         status = fDeferredStatus;
1019         return -1;
1020     }
1021     if (fMatch == FALSE) {
1022         status = U_REGEX_INVALID_STATE;
1023         return -1;
1024     }
1025     if (group < 0 || group > fPattern->fGroupMap->size()) {
1026         status = U_INDEX_OUTOFBOUNDS_ERROR;
1027         return -1;
1028     }
1029     int32_t s;
1030     if (group == 0) {
1031         s = fMatchStart;
1032     } else {
1033         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
1034         U_ASSERT(groupOffset < fPattern->fFrameSize);
1035         U_ASSERT(groupOffset >= 0);
1036         s = fFrame->fExtra[groupOffset];
1037     }
1038     return s;
1039 }
1040 
1041 
1042 
1043 //--------------------------------------------------------------------------------
1044 //
1045 //     useAnchoringBounds
1046 //
1047 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)1048 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
1049     fAnchoringBounds = b;
1050     UErrorCode status = U_ZERO_ERROR;
1051     region(fRegionStart, fRegionLimit, status);
1052     U_ASSERT(U_SUCCESS(status));
1053     return *this;
1054 }
1055 
1056 
1057 //--------------------------------------------------------------------------------
1058 //
1059 //     useTransparentBounds
1060 //
1061 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)1062 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
1063     fTransparentBounds = b;
1064     UErrorCode status = U_ZERO_ERROR;
1065     region(fRegionStart, fRegionLimit, status);
1066     U_ASSERT(U_SUCCESS(status));
1067     return *this;
1068 }
1069 
1070 
1071 //================================================================================
1072 //
1073 //    Code following this point in this file is the internal
1074 //    Match Engine Implementation.
1075 //
1076 //================================================================================
1077 
1078 
1079 //--------------------------------------------------------------------------------
1080 //
1081 //   resetStack
1082 //           Discard any previous contents of the state save stack, and initialize a
1083 //           new stack frame to all -1.  The -1s are needed for capture group limits,
1084 //           where they indicate that a group has not yet matched anything.
1085 //--------------------------------------------------------------------------------
resetStack()1086 REStackFrame *RegexMatcher::resetStack() {
1087     // Discard any previous contents of the state save stack, and initialize a
1088     //  new stack frame to all -1.  The -1s are needed for capture group limits, where
1089     //  they indicate that a group has not yet matched anything.
1090     fStack->removeAllElements();
1091 
1092     int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
1093     int32_t i;
1094     for (i=0; i<fPattern->fFrameSize; i++) {
1095         iFrame[i] = -1;
1096     }
1097     return (REStackFrame *)iFrame;
1098 }
1099 
1100 
1101 
1102 //--------------------------------------------------------------------------------
1103 //
1104 //   isWordBoundary
1105 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
1106 //                     For us,
1107 //                       If the current char is a combining mark,
1108 //                          \b is FALSE.
1109 //                       Else Scan backwards to the first non-combining char.
1110 //                            We are at a boundary if the this char and the original chars are
1111 //                               opposite in membership in \w set
1112 //
1113 //          parameters:   pos   - the current position in the input buffer
1114 //
1115 //              TODO:  double-check edge cases at region boundaries.
1116 //
1117 //--------------------------------------------------------------------------------
isWordBoundary(int32_t pos)1118 UBool RegexMatcher::isWordBoundary(int32_t pos) {
1119     UBool isBoundary = FALSE;
1120     UBool cIsWord    = FALSE;
1121 
1122     if (pos >= fLookLimit) {
1123         fHitEnd = TRUE;
1124     } else {
1125         // Determine whether char c at current position is a member of the word set of chars.
1126         // If we're off the end of the string, behave as though we're not at a word char.
1127         UChar32  c = fInput->char32At(pos);
1128         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
1129             // Current char is a combining one.  Not a boundary.
1130             return FALSE;
1131         }
1132         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
1133     }
1134 
1135     // Back up until we come to a non-combining char, determine whether
1136     //  that char is a word char.
1137     UBool prevCIsWord = FALSE;
1138     int32_t prevPos = pos;
1139     for (;;) {
1140         if (prevPos <= fLookStart) {
1141             break;
1142         }
1143         prevPos = fInput->moveIndex32(prevPos, -1);
1144         UChar32 prevChar = fInput->char32At(prevPos);
1145         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
1146               || u_charType(prevChar) == U_FORMAT_CHAR)) {
1147             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
1148             break;
1149         }
1150     }
1151     isBoundary = cIsWord ^ prevCIsWord;
1152     return isBoundary;
1153 }
1154 
1155 //--------------------------------------------------------------------------------
1156 //
1157 //   isUWordBoundary
1158 //
1159 //         Test for a word boundary using RBBI word break.
1160 //
1161 //          parameters:   pos   - the current position in the input buffer
1162 //
1163 //--------------------------------------------------------------------------------
isUWordBoundary(int32_t pos)1164 UBool RegexMatcher::isUWordBoundary(int32_t pos) {
1165     UBool       returnVal = FALSE;
1166 #if UCONFIG_NO_BREAK_ITERATION==0
1167 
1168     // If we haven't yet created a break iterator for this matcher, do it now.
1169     if (fWordBreakItr == NULL) {
1170         fWordBreakItr =
1171             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
1172         if (U_FAILURE(fDeferredStatus)) {
1173             return FALSE;
1174         }
1175         fWordBreakItr->setText(*fInput);
1176     }
1177 
1178     if (pos >= fLookLimit) {
1179         fHitEnd = TRUE;
1180         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
1181                             //    words are not boundaries.  All non-word chars stand by themselves,
1182                             //    with word boundaries on both sides.
1183     } else {
1184         returnVal = fWordBreakItr->isBoundary(pos);
1185     }
1186 #endif
1187     return   returnVal;
1188 }
1189 
1190 //--------------------------------------------------------------------------------
1191 //
1192 //   StateSave
1193 //       Make a new stack frame, initialized as a copy of the current stack frame.
1194 //       Set the pattern index in the original stack frame from the operand value
1195 //       in the opcode.  Execution of the engine continues with the state in
1196 //       the newly created stack frame
1197 //
1198 //       Note that reserveBlock() may grow the stack, resulting in the
1199 //       whole thing being relocated in memory.
1200 //
1201 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int32_t savePatIdx,int32_t frameSize,UErrorCode & status)1202 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
1203     // push storage for a new frame.
1204     int32_t *newFP = fStack->reserveBlock(frameSize, status);
1205     fp = (REStackFrame *)(newFP - frameSize);  // in case of realloc of stack.
1206 
1207     // New stack frame = copy of old top frame.
1208     int32_t *source = (int32_t *)fp;
1209     int32_t *dest   = newFP;
1210     for (;;) {
1211         *dest++ = *source++;
1212         if (source == newFP) {
1213             break;
1214         }
1215     }
1216 
1217     fp->fPatIdx = savePatIdx;
1218     return (REStackFrame *)newFP;
1219 }
1220 
1221 
1222 //--------------------------------------------------------------------------------
1223 //
1224 //   MatchAt      This is the actual matching engine.
1225 //
1226 //                  startIdx:    begin matching a this index.
1227 //                  toEnd:       if true, match must extend to end of the input region
1228 //
1229 //--------------------------------------------------------------------------------
MatchAt(int32_t startIdx,UBool toEnd,UErrorCode & status)1230 void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
1231     UBool       isMatch  = FALSE;      // True if the we have a match.
1232 
1233     int32_t     op;                    // Operation from the compiled pattern, split into
1234     int32_t     opType;                //    the opcode
1235     int32_t     opValue;               //    and the operand value.
1236 
1237     #ifdef REGEX_RUN_DEBUG
1238     if (fTraceDebug)
1239     {
1240         printf("MatchAt(startIdx=%d)\n", startIdx);
1241         printf("Original Pattern: ");
1242         int32_t i;
1243         for (i=0; i<fPattern->fPattern.length(); i++) {
1244             printf("%c", fPattern->fPattern.charAt(i));
1245         }
1246         printf("\n");
1247         printf("Input String: ");
1248         for (i=0; i<fInput->length(); i++) {
1249             UChar c = fInput->charAt(i);
1250             if (c<32 || c>256) {
1251                 c = '.';
1252             }
1253             printf("%c", c);
1254         }
1255         printf("\n");
1256         printf("\n");
1257     }
1258     #endif
1259 
1260     if (U_FAILURE(status)) {
1261         return;
1262     }
1263 
1264     //  Cache frequently referenced items from the compiled pattern
1265     //  in local variables.
1266     //
1267     int32_t             *pat           = fPattern->fCompiledPat->getBuffer();
1268 
1269     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
1270     UVector             *sets          = fPattern->fSets;
1271 
1272     const UChar         *inputBuf      = fInput->getBuffer();
1273 
1274     REStackFrame        *fp            = resetStack();
1275     int32_t              frameSize     = fPattern->fFrameSize;
1276 
1277     fp->fPatIdx   = 0;
1278     fp->fInputIdx = startIdx;
1279 
1280     // Zero out the pattern's static data
1281     int32_t i;
1282     for (i = 0; i<fPattern->fDataSize; i++) {
1283         fData[i] = 0;
1284     }
1285 
1286     //
1287     //  Main loop for interpreting the compiled pattern.
1288     //  One iteration of the loop per pattern operation performed.
1289     //
1290     for (;;) {
1291 #if 0
1292         if (_heapchk() != _HEAPOK) {
1293             fprintf(stderr, "Heap Trouble\n");
1294         }
1295 #endif
1296         op      = pat[fp->fPatIdx];
1297         opType  = URX_TYPE(op);
1298         opValue = URX_VAL(op);
1299         #ifdef REGEX_RUN_DEBUG
1300         if (fTraceDebug) {
1301             printf("inputIdx=%d   inputChar=%c   sp=%3d  ", fp->fInputIdx,
1302                 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
1303             fPattern->dumpOp(fp->fPatIdx);
1304         }
1305         #endif
1306         fp->fPatIdx++;
1307 
1308         switch (opType) {
1309 
1310 
1311         case URX_NOP:
1312             break;
1313 
1314 
1315         case URX_BACKTRACK:
1316             // Force a backtrack.  In some circumstances, the pattern compiler
1317             //   will notice that the pattern can't possibly match anything, and will
1318             //   emit one of these at that point.
1319             fp = (REStackFrame *)fStack->popFrame(frameSize);
1320             break;
1321 
1322 
1323         case URX_ONECHAR:
1324             if (fp->fInputIdx < fActiveLimit) {
1325                 UChar32   c;
1326                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1327                 if (c == opValue) {
1328                     break;
1329                 }
1330             } else {
1331                 fHitEnd = TRUE;
1332             }
1333             fp = (REStackFrame *)fStack->popFrame(frameSize);
1334             break;
1335 
1336 
1337         case URX_STRING:
1338             {
1339                 // Test input against a literal string.
1340                 // Strings require two slots in the compiled pattern, one for the
1341                 //   offset to the string text, and one for the length.
1342                 int32_t   stringStartIdx = opValue;
1343                 int32_t   stringLen;
1344 
1345                 op      = pat[fp->fPatIdx];     // Fetch the second operand
1346                 fp->fPatIdx++;
1347                 opType    = URX_TYPE(op);
1348                 stringLen = URX_VAL(op);
1349                 U_ASSERT(opType == URX_STRING_LEN);
1350                 U_ASSERT(stringLen >= 2);
1351 
1352                 if (fp->fInputIdx + stringLen > fActiveLimit) {
1353                     // No match.  String is longer than the remaining input text.
1354                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
1355                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1356                     break;
1357                 }
1358 
1359                 const UChar * pInp = inputBuf + fp->fInputIdx;
1360                 const UChar * pPat = litText+stringStartIdx;
1361                 const UChar * pEnd = pInp + stringLen;
1362                 for(;;) {
1363                     if (*pInp == *pPat) {
1364                         pInp++;
1365                         pPat++;
1366                         if (pInp == pEnd) {
1367                             // Successful Match.
1368                             fp->fInputIdx += stringLen;
1369                             break;
1370                         }
1371                     } else {
1372                         // Match failed.
1373                         fp = (REStackFrame *)fStack->popFrame(frameSize);
1374                         break;
1375                     }
1376                 }
1377             }
1378             break;
1379 
1380 
1381 
1382         case URX_STATE_SAVE:
1383             fp = StateSave(fp, opValue, frameSize, status);
1384             break;
1385 
1386 
1387         case URX_END:
1388             // The match loop will exit via this path on a successful match,
1389             //   when we reach the end of the pattern.
1390             if (toEnd && fp->fInputIdx != fActiveLimit) {
1391                 // The pattern matched, but not to the end of input.  Try some more.
1392                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1393                 break;
1394             }
1395             isMatch = TRUE;
1396             goto  breakFromLoop;
1397 
1398         // Start and End Capture stack frame variables are layout out like this:
1399             //  fp->fExtra[opValue]  - The start of a completed capture group
1400             //             opValue+1 - The end   of a completed capture group
1401             //             opValue+2 - the start of a capture group whose end
1402             //                          has not yet been reached (and might not ever be).
1403         case URX_START_CAPTURE:
1404             U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1405             fp->fExtra[opValue+2] = fp->fInputIdx;
1406             break;
1407 
1408 
1409         case URX_END_CAPTURE:
1410             U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1411             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
1412             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
1413             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
1414             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
1415             break;
1416 
1417 
1418         case URX_DOLLAR:                   //  $, test for End of line
1419                                            //     or for position before new line at end of input
1420             if (fp->fInputIdx < fAnchorLimit-2) {
1421                 // We are no where near the end of input.  Fail.
1422                 //   This is the common case.  Keep it first.
1423                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1424                 break;
1425             }
1426             if (fp->fInputIdx >= fAnchorLimit) {
1427                 // We really are at the end of input.  Success.
1428                 fHitEnd = TRUE;
1429                 fRequireEnd = TRUE;
1430                 break;
1431             }
1432             // If we are positioned just before a new-line that is located at the
1433             //   end of input, succeed.
1434             if (fp->fInputIdx == fAnchorLimit-1) {
1435                 UChar32 c = fInput->char32At(fp->fInputIdx);
1436                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
1437                     // If not in the middle of a CR/LF sequence
1438                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1439                         // At new-line at end of input. Success
1440                         fHitEnd = TRUE;
1441                         fRequireEnd = TRUE;
1442                         break;
1443                     }
1444                 }
1445             }
1446 
1447             if (fp->fInputIdx == fAnchorLimit-2 &&
1448                  fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
1449                     fHitEnd = TRUE;
1450                     fRequireEnd = TRUE;
1451                     break;                         // At CR/LF at end of input.  Success
1452             }
1453 
1454             fp = (REStackFrame *)fStack->popFrame(frameSize);
1455 
1456             break;
1457 
1458 
1459          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
1460             if (fp->fInputIdx >= fAnchorLimit-1) {
1461                 // Either at the last character of input, or off the end.
1462                 if (fp->fInputIdx == fAnchorLimit-1) {
1463                     // At last char of input.  Success if it's a new line.
1464                     if (fInput->char32At(fp->fInputIdx) == 0x0a) {
1465                         fHitEnd = TRUE;
1466                         fRequireEnd = TRUE;
1467                         break;
1468                     }
1469                 } else {
1470                     // Off the end of input.  Success.
1471                     fHitEnd = TRUE;
1472                     fRequireEnd = TRUE;
1473                     break;
1474                 }
1475             }
1476 
1477             // Not at end of input.  Back-track out.
1478             fp = (REStackFrame *)fStack->popFrame(frameSize);
1479             break;
1480 
1481 
1482          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
1483              {
1484                  if (fp->fInputIdx >= fAnchorLimit) {
1485                      // We really are at the end of input.  Success.
1486                      fHitEnd = TRUE;
1487                      fRequireEnd = TRUE;
1488                      break;
1489                  }
1490                  // If we are positioned just before a new-line, succeed.
1491                  // It makes no difference where the new-line is within the input.
1492                  UChar32 c = inputBuf[fp->fInputIdx];
1493                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
1494                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
1495                      //  In multi-line mode, hitting a new-line just before the end of input does not
1496                      //   set the hitEnd or requireEnd flags
1497                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1498                         break;
1499                      }
1500                  }
1501                  // not at a new line.  Fail.
1502                  fp = (REStackFrame *)fStack->popFrame(frameSize);
1503              }
1504              break;
1505 
1506 
1507          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
1508              {
1509                  if (fp->fInputIdx >= fAnchorLimit) {
1510                      // We really are at the end of input.  Success.
1511                      fHitEnd = TRUE;
1512                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
1513                      break;               //   adding a new-line would not lose the match.
1514                  }
1515                  // If we are not positioned just before a new-line, the test fails; backtrack out.
1516                  // It makes no difference where the new-line is within the input.
1517                  if (inputBuf[fp->fInputIdx] != 0x0a) {
1518                      fp = (REStackFrame *)fStack->popFrame(frameSize);
1519                  }
1520              }
1521              break;
1522 
1523 
1524        case URX_CARET:                    //  ^, test for start of line
1525             if (fp->fInputIdx != fAnchorStart) {
1526                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1527             }
1528             break;
1529 
1530 
1531        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
1532            {
1533                if (fp->fInputIdx == fAnchorStart) {
1534                    // We are at the start input.  Success.
1535                    break;
1536                }
1537                // Check whether character just before the current pos is a new-line
1538                //   unless we are at the end of input
1539                UChar  c = inputBuf[fp->fInputIdx - 1];
1540                if ((fp->fInputIdx < fAnchorLimit) &&
1541                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
1542                    //  It's a new-line.  ^ is true.  Success.
1543                    //  TODO:  what should be done with positions between a CR and LF?
1544                    break;
1545                }
1546                // Not at the start of a line.  Fail.
1547                fp = (REStackFrame *)fStack->popFrame(frameSize);
1548            }
1549            break;
1550 
1551 
1552        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
1553            {
1554                U_ASSERT(fp->fInputIdx >= fAnchorStart);
1555                if (fp->fInputIdx <= fAnchorStart) {
1556                    // We are at the start input.  Success.
1557                    break;
1558                }
1559                // Check whether character just before the current pos is a new-line
1560                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
1561                UChar  c = inputBuf[fp->fInputIdx - 1];
1562                if (c != 0x0a) {
1563                    // Not at the start of a line.  Back-track out.
1564                    fp = (REStackFrame *)fStack->popFrame(frameSize);
1565                }
1566            }
1567            break;
1568 
1569         case URX_BACKSLASH_B:          // Test for word boundaries
1570             {
1571                 UBool success = isWordBoundary(fp->fInputIdx);
1572                 success ^= (opValue != 0);     // flip sense for \B
1573                 if (!success) {
1574                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1575                 }
1576             }
1577             break;
1578 
1579 
1580         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
1581             {
1582                 UBool success = isUWordBoundary(fp->fInputIdx);
1583                 success ^= (opValue != 0);     // flip sense for \B
1584                 if (!success) {
1585                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1586                 }
1587             }
1588             break;
1589 
1590 
1591         case URX_BACKSLASH_D:            // Test for decimal digit
1592             {
1593                 if (fp->fInputIdx >= fActiveLimit) {
1594                     fHitEnd = TRUE;
1595                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1596                     break;
1597                 }
1598 
1599                 UChar32 c = fInput->char32At(fp->fInputIdx);
1600                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
1601                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
1602                 success ^= (opValue != 0);        // flip sense for \D
1603                 if (success) {
1604                     fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
1605                 } else {
1606                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1607                 }
1608             }
1609             break;
1610 
1611 
1612         case URX_BACKSLASH_G:          // Test for position at end of previous match
1613             if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
1614                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1615             }
1616             break;
1617 
1618 
1619         case URX_BACKSLASH_X:
1620             //  Match a Grapheme, as defined by Unicode TR 29.
1621             //  Differs slightly from Perl, which consumes combining marks independently
1622             //    of context.
1623             {
1624 
1625                 // Fail if at end of input
1626                 if (fp->fInputIdx >= fActiveLimit) {
1627                     fHitEnd = TRUE;
1628                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1629                     break;
1630                 }
1631 
1632                 // Examine (and consume) the current char.
1633                 //   Dispatch into a little state machine, based on the char.
1634                 UChar32  c;
1635                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1636                 UnicodeSet **sets = fPattern->fStaticSets;
1637                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
1638                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
1639                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
1640                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
1641                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
1642                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
1643                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
1644                 goto GC_Extend;
1645 
1646 
1647 
1648 GC_L:
1649                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
1650                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1651                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
1652                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
1653                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
1654                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
1655                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1656                 goto GC_Extend;
1657 
1658 GC_V:
1659                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
1660                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1661                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
1662                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
1663                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1664                 goto GC_Extend;
1665 
1666 GC_T:
1667                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
1668                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1669                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
1670                 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1671                 goto GC_Extend;
1672 
1673 GC_Extend:
1674                 // Combining characters are consumed here
1675                 for (;;) {
1676                     if (fp->fInputIdx >= fActiveLimit) {
1677                         break;
1678                     }
1679                     U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
1680                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
1681                         break;
1682                     }
1683                     U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
1684                 }
1685                 goto GC_Done;
1686 
1687 GC_Control:
1688                 // Most control chars stand alone (don't combine with combining chars),
1689                 //   except for that CR/LF sequence is a single grapheme cluster.
1690                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
1691                     fp->fInputIdx++;
1692                 }
1693 
1694 GC_Done:
1695                 if (fp->fInputIdx >= fActiveLimit) {
1696                     fHitEnd = TRUE;
1697                 }
1698                 break;
1699             }
1700 
1701 
1702 
1703 
1704         case URX_BACKSLASH_Z:          // Test for end of Input
1705             if (fp->fInputIdx < fAnchorLimit) {
1706                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1707             } else {
1708                 fHitEnd = TRUE;
1709                 fRequireEnd = TRUE;
1710             }
1711             break;
1712 
1713 
1714 
1715         case URX_STATIC_SETREF:
1716             {
1717                 // Test input character against one of the predefined sets
1718                 //    (Word Characters, for example)
1719                 // The high bit of the op value is a flag for the match polarity.
1720                 //    0:   success if input char is in set.
1721                 //    1:   success if input char is not in set.
1722                 if (fp->fInputIdx >= fActiveLimit) {
1723                     fHitEnd = TRUE;
1724                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1725                     break;
1726                 }
1727 
1728                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
1729                 opValue &= ~URX_NEG_SET;
1730                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1731                 UChar32  c;
1732                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1733                 if (c < 256) {
1734                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1735                     if (s8->contains(c)) {
1736                         success = !success;
1737                     }
1738                 } else {
1739                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
1740                     if (s->contains(c)) {
1741                         success = !success;
1742                     }
1743                 }
1744                 if (!success) {
1745                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1746                 }
1747             }
1748             break;
1749 
1750 
1751         case URX_STAT_SETREF_N:
1752             {
1753                 // Test input character for NOT being a member of  one of
1754                 //    the predefined sets (Word Characters, for example)
1755                 if (fp->fInputIdx >= fActiveLimit) {
1756                     fHitEnd = TRUE;
1757                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1758                     break;
1759                 }
1760 
1761                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1762                 UChar32  c;
1763                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1764                 if (c < 256) {
1765                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1766                     if (s8->contains(c) == FALSE) {
1767                         break;
1768                     }
1769                 } else {
1770                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
1771                     if (s->contains(c) == FALSE) {
1772                         break;
1773                     }
1774                 }
1775 
1776                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1777             }
1778             break;
1779 
1780 
1781         case URX_SETREF:
1782             if (fp->fInputIdx >= fActiveLimit) {
1783                 fHitEnd = TRUE;
1784                 fp = (REStackFrame *)fStack->popFrame(frameSize);
1785                 break;
1786             }
1787             // There is input left.  Pick up one char and test it for set membership.
1788             UChar32   c;
1789             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1790             U_ASSERT(opValue > 0 && opValue < sets->size());
1791             if (c<256) {
1792                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
1793                 if (s8->contains(c)) {
1794                     break;
1795                 }
1796             } else {
1797                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
1798                 if (s->contains(c)) {
1799                     // The character is in the set.  A Match.
1800                     break;
1801                 }
1802             }
1803             // the character wasn't in the set. Back track out.
1804             fp = (REStackFrame *)fStack->popFrame(frameSize);
1805             break;
1806 
1807 
1808         case URX_DOTANY:
1809             {
1810                 // . matches anything, but stops at end-of-line.
1811                 if (fp->fInputIdx >= fActiveLimit) {
1812                     // At end of input.  Match failed.  Backtrack out.
1813                     fHitEnd = TRUE;
1814                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1815                     break;
1816                 }
1817                 // There is input left.  Advance over one char, unless we've hit end-of-line
1818                 UChar32 c;
1819                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1820                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1821                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
1822                     // End of line in normal mode.   . does not match.
1823                         fp = (REStackFrame *)fStack->popFrame(frameSize);
1824                     break;
1825                 }
1826             }
1827             break;
1828 
1829 
1830         case URX_DOTANY_ALL:
1831             {
1832                 // ., in dot-matches-all (including new lines) mode
1833                 if (fp->fInputIdx >= fActiveLimit) {
1834                     // At end of input.  Match failed.  Backtrack out.
1835                     fHitEnd = TRUE;
1836                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1837                     break;
1838                 }
1839                 // There is input left.  Advance over one char, except if we are
1840                 //   at a cr/lf, advance over both of them.
1841                 UChar32 c;
1842                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1843                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
1844                     // In the case of a CR/LF, we need to advance over both.
1845                     UChar nextc = inputBuf[fp->fInputIdx];
1846                     if (nextc == 0x0a) {
1847                         fp->fInputIdx++;
1848                     }
1849                 }
1850             }
1851             break;
1852 
1853 
1854         case URX_DOTANY_UNIX:
1855             {
1856                 // '.' operator, matches all, but stops at end-of-line.
1857                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
1858                 if (fp->fInputIdx >= fActiveLimit) {
1859                     // At end of input.  Match failed.  Backtrack out.
1860                     fHitEnd = TRUE;
1861                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1862                     break;
1863                 }
1864                 // There is input left.  Advance over one char, unless we've hit end-of-line
1865                 UChar32 c;
1866                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1867                 if (c == 0x0a) {
1868                     // End of line in normal mode.   '.' does not match the \n
1869                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1870                 }
1871             }
1872             break;
1873 
1874 
1875         case URX_JMP:
1876             fp->fPatIdx = opValue;
1877             break;
1878 
1879         case URX_FAIL:
1880             isMatch = FALSE;
1881             goto breakFromLoop;
1882 
1883         case URX_JMP_SAV:
1884             U_ASSERT(opValue < fPattern->fCompiledPat->size());
1885             fp = StateSave(fp, fp->fPatIdx, frameSize, status);  // State save to loc following current
1886             fp->fPatIdx = opValue;                               // Then JMP.
1887             break;
1888 
1889         case URX_JMP_SAV_X:
1890             // This opcode is used with (x)+, when x can match a zero length string.
1891             // Same as JMP_SAV, except conditional on the match having made forward progress.
1892             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
1893             //   data address of the input position at the start of the loop.
1894             {
1895                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
1896                 int32_t  stoOp = pat[opValue-1];
1897                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
1898                 int32_t  frameLoc = URX_VAL(stoOp);
1899                 U_ASSERT(frameLoc >= 0 && frameLoc < frameSize);
1900                 int32_t prevInputIdx = fp->fExtra[frameLoc];
1901                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
1902                 if (prevInputIdx < fp->fInputIdx) {
1903                     // The match did make progress.  Repeat the loop.
1904                     fp = StateSave(fp, fp->fPatIdx, frameSize, status);  // State save to loc following current
1905                     fp->fPatIdx = opValue;
1906                     fp->fExtra[frameLoc] = fp->fInputIdx;
1907                 }
1908                 // If the input position did not advance, we do nothing here,
1909                 //   execution will fall out of the loop.
1910             }
1911             break;
1912 
1913         case URX_CTR_INIT:
1914             {
1915                 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1916                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
1917 
1918                 // Pick up the three extra operands that CTR_INIT has, and
1919                 //    skip the pattern location counter past
1920                 int32_t instrOperandLoc = fp->fPatIdx;
1921                 fp->fPatIdx += 3;
1922                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
1923                 int32_t minCount = pat[instrOperandLoc+1];
1924                 int32_t maxCount = pat[instrOperandLoc+2];
1925                 U_ASSERT(minCount>=0);
1926                 U_ASSERT(maxCount>=minCount || maxCount==-1);
1927                 U_ASSERT(loopLoc>fp->fPatIdx);
1928 
1929                 if (minCount == 0) {
1930                     fp = StateSave(fp, loopLoc+1, frameSize, status);
1931                 }
1932                 if (maxCount == 0) {
1933                     fp = (REStackFrame *)fStack->popFrame(frameSize);
1934                 }
1935             }
1936             break;
1937 
1938         case URX_CTR_LOOP:
1939             {
1940                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1941                 int32_t initOp = pat[opValue];
1942                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
1943                 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1944                 int32_t minCount  = pat[opValue+2];
1945                 int32_t maxCount = pat[opValue+3];
1946                 // Increment the counter.  Note: we're not worrying about counter
1947                 //   overflow, since the data comes from UnicodeStrings, which
1948                 //   stores its length in an int32_t.
1949                 (*pCounter)++;
1950                 U_ASSERT(*pCounter > 0);
1951                 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
1952                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
1953                     break;
1954                 }
1955                 if (*pCounter >= minCount) {
1956                     fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1957                 }
1958                 fp->fPatIdx = opValue + 4;    // Loop back.
1959             }
1960             break;
1961 
1962         case URX_CTR_INIT_NG:
1963             {
1964                 // Initialize a non-greedy loop
1965                 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1966                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
1967 
1968                 // Pick up the three extra operands that CTR_INIT has, and
1969                 //    skip the pattern location counter past
1970                 int32_t instrOperandLoc = fp->fPatIdx;
1971                 fp->fPatIdx += 3;
1972                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
1973                 int32_t minCount = pat[instrOperandLoc+1];
1974                 int32_t maxCount = pat[instrOperandLoc+2];
1975                 U_ASSERT(minCount>=0);
1976                 U_ASSERT(maxCount>=minCount || maxCount==-1);
1977                 U_ASSERT(loopLoc>fp->fPatIdx);
1978 
1979                 if (minCount == 0) {
1980                     if (maxCount != 0) {
1981                         fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1982                     }
1983                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
1984                 }
1985             }
1986             break;
1987 
1988         case URX_CTR_LOOP_NG:
1989             {
1990                 // Non-greedy {min, max} loops
1991                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1992                 int32_t initOp = pat[opValue];
1993                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
1994                 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1995                 int32_t minCount  = pat[opValue+2];
1996                 int32_t maxCount = pat[opValue+3];
1997                 // Increment the counter.  Note: we're not worrying about counter
1998                 //   overflow, since the data comes from UnicodeStrings, which
1999                 //   stores its length in an int32_t.
2000                 (*pCounter)++;
2001                 U_ASSERT(*pCounter > 0);
2002 
2003                 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2004                     // The loop has matched the maximum permitted number of times.
2005                     //   Break out of here with no action.  Matching will
2006                     //   continue with the following pattern.
2007                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
2008                     break;
2009                 }
2010 
2011                 if (*pCounter < minCount) {
2012                     // We haven't met the minimum number of matches yet.
2013                     //   Loop back for another one.
2014                     fp->fPatIdx = opValue + 4;    // Loop back.
2015                 } else {
2016                     // We do have the minimum number of matches.
2017                     //   Fall into the following pattern, but first do
2018                     //   a state save to the top of the loop, so that a failure
2019                     //   in the following pattern will try another iteration of the loop.
2020                     fp = StateSave(fp, opValue + 4, frameSize, status);
2021                 }
2022             }
2023             break;
2024 
2025         case URX_STO_SP:
2026             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2027             fData[opValue] = fStack->size();
2028             break;
2029 
2030         case URX_LD_SP:
2031             {
2032                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2033                 int32_t newStackSize = fData[opValue];
2034                 U_ASSERT(newStackSize <= fStack->size());
2035                 int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
2036                 if (newFP == (int32_t *)fp) {
2037                     break;
2038                 }
2039                 int32_t i;
2040                 for (i=0; i<frameSize; i++) {
2041                     newFP[i] = ((int32_t *)fp)[i];
2042                 }
2043                 fp = (REStackFrame *)newFP;
2044                 fStack->setSize(newStackSize);
2045             }
2046             break;
2047 
2048         case URX_BACKREF:
2049         case URX_BACKREF_I:
2050             {
2051                 U_ASSERT(opValue < frameSize);
2052                 int32_t groupStartIdx = fp->fExtra[opValue];
2053                 int32_t groupEndIdx   = fp->fExtra[opValue+1];
2054                 U_ASSERT(groupStartIdx <= groupEndIdx);
2055                 int32_t len = groupEndIdx-groupStartIdx;
2056                 if (groupStartIdx < 0) {
2057                     // This capture group has not participated in the match thus far,
2058                     fp = (REStackFrame *)fStack->popFrame(frameSize);   // FAIL, no match.
2059                 }
2060 
2061                 if (len == 0) {
2062                         //   The capture group match was of an empty string.
2063                         //   Verified by testing:  Perl matches succeed in this case, so
2064                         //   we do too.
2065                         break;
2066                     }
2067 
2068                 UBool  haveMatch = FALSE;
2069                 if (fp->fInputIdx + len <= fActiveLimit) {
2070                     if (opType == URX_BACKREF) {
2071                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
2072                             haveMatch = TRUE;
2073                         }
2074                     } else {
2075                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
2076                                   len, U_FOLD_CASE_DEFAULT) == 0) {
2077                             haveMatch = TRUE;
2078                         }
2079                     }
2080                 } else {
2081                     // TODO: probably need to do a partial string comparison, and only
2082                     //       set HitEnd if the available input matched.  Ticket #6074
2083                     fHitEnd = TRUE;
2084                 }
2085                 if (haveMatch) {
2086                     fp->fInputIdx += len;     // Match.  Advance current input position.
2087                 } else {
2088                     fp = (REStackFrame *)fStack->popFrame(frameSize);   // FAIL, no match.
2089                 }
2090             }
2091             break;
2092 
2093         case URX_STO_INP_LOC:
2094             {
2095                 U_ASSERT(opValue >= 0 && opValue < frameSize);
2096                 fp->fExtra[opValue] = fp->fInputIdx;
2097             }
2098             break;
2099 
2100         case URX_JMPX:
2101             {
2102                 int32_t instrOperandLoc = fp->fPatIdx;
2103                 fp->fPatIdx += 1;
2104                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
2105                 U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
2106                 int32_t savedInputIdx = fp->fExtra[dataLoc];
2107                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
2108                 if (savedInputIdx < fp->fInputIdx) {
2109                     fp->fPatIdx = opValue;                               // JMP
2110                 } else {
2111                      fp = (REStackFrame *)fStack->popFrame(frameSize);   // FAIL, no progress in loop.
2112                 }
2113             }
2114             break;
2115 
2116         case URX_LA_START:
2117             {
2118                 // Entering a lookahead block.
2119                 // Save Stack Ptr, Input Pos.
2120                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2121                 fData[opValue]   = fStack->size();
2122                 fData[opValue+1] = fp->fInputIdx;
2123                 fActiveStart     = fLookStart;          // Set the match region change for
2124                 fActiveLimit     = fLookLimit;          //   transparent bounds.
2125             }
2126             break;
2127 
2128         case URX_LA_END:
2129             {
2130                 // Leaving a look-ahead block.
2131                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
2132                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2133                 int32_t stackSize = fStack->size();
2134                 int32_t newStackSize = fData[opValue];
2135                 U_ASSERT(stackSize >= newStackSize);
2136                 if (stackSize > newStackSize) {
2137                     // Copy the current top frame back to the new (cut back) top frame.
2138                     //   This makes the capture groups from within the look-ahead
2139                     //   expression available.
2140                     int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
2141                     int32_t i;
2142                     for (i=0; i<frameSize; i++) {
2143                         newFP[i] = ((int32_t *)fp)[i];
2144                     }
2145                     fp = (REStackFrame *)newFP;
2146                     fStack->setSize(newStackSize);
2147                 }
2148                 fp->fInputIdx = fData[opValue+1];
2149 
2150                 // Restore the active region bounds in the input string; they may have
2151                 //    been changed because of transparent bounds on a Region.
2152                 fActiveStart = fRegionStart;
2153                 fActiveLimit = fRegionLimit;
2154             }
2155             break;
2156 
2157         case URX_ONECHAR_I:
2158             if (fp->fInputIdx < fActiveLimit) {
2159                 UChar32   c;
2160                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2161                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
2162                     break;
2163                 }
2164             } else {
2165                 fHitEnd = TRUE;
2166             }
2167             fp = (REStackFrame *)fStack->popFrame(frameSize);
2168             break;
2169 
2170         case URX_STRING_I:
2171             {
2172                 // Test input against a literal string.
2173                 // Strings require two slots in the compiled pattern, one for the
2174                 //   offset to the string text, and one for the length.
2175                 int32_t stringStartIdx, stringLen;
2176                 stringStartIdx = opValue;
2177 
2178                 op      = pat[fp->fPatIdx];
2179                 fp->fPatIdx++;
2180                 opType  = URX_TYPE(op);
2181                 opValue = URX_VAL(op);
2182                 U_ASSERT(opType == URX_STRING_LEN);
2183                 stringLen = opValue;
2184 
2185                 int32_t stringEndIndex = fp->fInputIdx + stringLen;
2186                 if (stringEndIndex <= fActiveLimit) {
2187                     if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
2188                         stringLen, U_FOLD_CASE_DEFAULT) == 0) {
2189                         // Success.  Advance the current input position.
2190                         fp->fInputIdx = stringEndIndex;
2191                         break;
2192                     }
2193                 } else {
2194                     // Insufficent input left for a match.
2195                     fHitEnd = TRUE;    // See ticket 6074
2196                 }
2197                 // No match.  Back up matching to a saved state
2198                 fp = (REStackFrame *)fStack->popFrame(frameSize);
2199             }
2200             break;
2201 
2202         case URX_LB_START:
2203             {
2204                 // Entering a look-behind block.
2205                 // Save Stack Ptr, Input Pos.
2206                 //   TODO:  implement transparent bounds.  Ticket #6067
2207                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2208                 fData[opValue]   = fStack->size();
2209                 fData[opValue+1] = fp->fInputIdx;
2210                 // Init the variable containing the start index for attempted matches.
2211                 fData[opValue+2] = -1;
2212                 // Save input string length, then reset to pin any matches to end at
2213                 //   the current position.
2214                 fData[opValue+3] = fActiveLimit;
2215                 fActiveLimit     = fp->fInputIdx;
2216             }
2217             break;
2218 
2219 
2220         case URX_LB_CONT:
2221             {
2222                 // Positive Look-Behind, at top of loop checking for matches of LB expression
2223                 //    at all possible input starting positions.
2224 
2225                 // Fetch the min and max possible match lengths.  They are the operands
2226                 //   of this op in the pattern.
2227                 int32_t minML = pat[fp->fPatIdx++];
2228                 int32_t maxML = pat[fp->fPatIdx++];
2229                 U_ASSERT(minML <= maxML);
2230                 U_ASSERT(minML >= 0);
2231 
2232                 // Fetch (from data) the last input index where a match was attempted.
2233                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2234                 int32_t  *lbStartIdx = &fData[opValue+2];
2235                 if (*lbStartIdx < 0) {
2236                     // First time through loop.
2237                     *lbStartIdx = fp->fInputIdx - minML;
2238                 } else {
2239                     // 2nd through nth time through the loop.
2240                     // Back up start position for match by one.
2241                     if (*lbStartIdx == 0) {
2242                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
2243                     } else {
2244                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
2245                     }
2246                 }
2247 
2248                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2249                     // We have tried all potential match starting points without
2250                     //  getting a match.  Backtrack out, and out of the
2251                     //   Look Behind altogether.
2252                     fp = (REStackFrame *)fStack->popFrame(frameSize);
2253                     int32_t restoreInputLen = fData[opValue+3];
2254                     U_ASSERT(restoreInputLen >= fActiveLimit);
2255                     U_ASSERT(restoreInputLen <= fInput->length());
2256                     fActiveLimit = restoreInputLen;
2257                     break;
2258                 }
2259 
2260                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2261                 //      (successful match will fall off the end of the loop.)
2262                 fp = StateSave(fp, fp->fPatIdx-3, frameSize, status);
2263                 fp->fInputIdx =  *lbStartIdx;
2264             }
2265             break;
2266 
2267         case URX_LB_END:
2268             // End of a look-behind block, after a successful match.
2269             {
2270                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2271                 if (fp->fInputIdx != fActiveLimit) {
2272                     //  The look-behind expression matched, but the match did not
2273                     //    extend all the way to the point that we are looking behind from.
2274                     //  FAIL out of here, which will take us back to the LB_CONT, which
2275                     //     will retry the match starting at another position or fail
2276                     //     the look-behind altogether, whichever is appropriate.
2277                     fp = (REStackFrame *)fStack->popFrame(frameSize);
2278                     break;
2279                 }
2280 
2281                 // Look-behind match is good.  Restore the orignal input string length,
2282                 //   which had been truncated to pin the end of the lookbehind match to the
2283                 //   position being looked-behind.
2284                 int32_t originalInputLen = fData[opValue+3];
2285                 U_ASSERT(originalInputLen >= fActiveLimit);
2286                 U_ASSERT(originalInputLen <= fInput->length());
2287                 fActiveLimit = originalInputLen;
2288             }
2289             break;
2290 
2291 
2292         case URX_LBN_CONT:
2293             {
2294                 // Negative Look-Behind, at top of loop checking for matches of LB expression
2295                 //    at all possible input starting positions.
2296 
2297                 // Fetch the extra parameters of this op.
2298                 int32_t minML       = pat[fp->fPatIdx++];
2299                 int32_t maxML       = pat[fp->fPatIdx++];
2300                 int32_t continueLoc = pat[fp->fPatIdx++];
2301                         continueLoc = URX_VAL(continueLoc);
2302                 U_ASSERT(minML <= maxML);
2303                 U_ASSERT(minML >= 0);
2304                 U_ASSERT(continueLoc > fp->fPatIdx);
2305 
2306                 // Fetch (from data) the last input index where a match was attempted.
2307                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2308                 int32_t  *lbStartIdx = &fData[opValue+2];
2309                 if (*lbStartIdx < 0) {
2310                     // First time through loop.
2311                     *lbStartIdx = fp->fInputIdx - minML;
2312                 } else {
2313                     // 2nd through nth time through the loop.
2314                     // Back up start position for match by one.
2315                     if (*lbStartIdx == 0) {
2316                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
2317                     } else {
2318                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
2319                     }
2320                 }
2321 
2322                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2323                     // We have tried all potential match starting points without
2324                     //  getting a match, which means that the negative lookbehind as
2325                     //  a whole has succeeded.  Jump forward to the continue location
2326                     int32_t restoreInputLen = fData[opValue+3];
2327                     U_ASSERT(restoreInputLen >= fActiveLimit);
2328                     U_ASSERT(restoreInputLen <= fInput->length());
2329                     fActiveLimit = restoreInputLen;
2330                     fp->fPatIdx = continueLoc;
2331                     break;
2332                 }
2333 
2334                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2335                 //      (successful match will cause a FAIL out of the loop altogether.)
2336                 fp = StateSave(fp, fp->fPatIdx-4, frameSize, status);
2337                 fp->fInputIdx =  *lbStartIdx;
2338             }
2339             break;
2340 
2341         case URX_LBN_END:
2342             // End of a negative look-behind block, after a successful match.
2343             {
2344                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2345                 if (fp->fInputIdx != fActiveLimit) {
2346                     //  The look-behind expression matched, but the match did not
2347                     //    extend all the way to the point that we are looking behind from.
2348                     //  FAIL out of here, which will take us back to the LB_CONT, which
2349                     //     will retry the match starting at another position or succeed
2350                     //     the look-behind altogether, whichever is appropriate.
2351                     fp = (REStackFrame *)fStack->popFrame(frameSize);
2352                     break;
2353                 }
2354 
2355                 // Look-behind expression matched, which means look-behind test as
2356                 //   a whole Fails
2357 
2358                 //   Restore the orignal input string length, which had been truncated
2359                 //   inorder to pin the end of the lookbehind match
2360                 //   to the position being looked-behind.
2361                 int32_t originalInputLen = fData[opValue+3];
2362                 U_ASSERT(originalInputLen >= fActiveLimit);
2363                 U_ASSERT(originalInputLen <= fInput->length());
2364                 fActiveLimit = originalInputLen;
2365 
2366                 // Restore original stack position, discarding any state saved
2367                 //   by the successful pattern match.
2368                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2369                 int32_t newStackSize = fData[opValue];
2370                 U_ASSERT(fStack->size() > newStackSize);
2371                 fStack->setSize(newStackSize);
2372 
2373                 //  FAIL, which will take control back to someplace
2374                 //  prior to entering the look-behind test.
2375                 fp = (REStackFrame *)fStack->popFrame(frameSize);
2376             }
2377             break;
2378 
2379 
2380         case URX_LOOP_SR_I:
2381             // Loop Initialization for the optimized implementation of
2382             //     [some character set]*
2383             //   This op scans through all matching input.
2384             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
2385             {
2386                 U_ASSERT(opValue > 0 && opValue < sets->size());
2387                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2388                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
2389 
2390                 // Loop through input, until either the input is exhausted or
2391                 //   we reach a character that is not a member of the set.
2392                 int32_t ix = fp->fInputIdx;
2393                 for (;;) {
2394                     if (ix >= fActiveLimit) {
2395                         fHitEnd = TRUE;
2396                         break;
2397                     }
2398                     UChar32   c;
2399                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
2400                     if (c<256) {
2401                         if (s8->contains(c) == FALSE) {
2402                             U16_BACK_1(inputBuf, 0, ix);
2403                             break;
2404                         }
2405                     } else {
2406                         if (s->contains(c) == FALSE) {
2407                             U16_BACK_1(inputBuf, 0, ix);
2408                             break;
2409                         }
2410                     }
2411                 }
2412 
2413                 // If there were no matching characters, skip over the loop altogether.
2414                 //   The loop doesn't run at all, a * op always succeeds.
2415                 if (ix == fp->fInputIdx) {
2416                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
2417                     break;
2418                 }
2419 
2420                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2421                 //   must follow.  It's operand is the stack location
2422                 //   that holds the starting input index for the match of this [set]*
2423                 int32_t loopcOp = pat[fp->fPatIdx];
2424                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2425                 int32_t stackLoc = URX_VAL(loopcOp);
2426                 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
2427                 fp->fExtra[stackLoc] = fp->fInputIdx;
2428                 fp->fInputIdx = ix;
2429 
2430                 // Save State to the URX_LOOP_C op that follows this one,
2431                 //   so that match failures in the following code will return to there.
2432                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2433                 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
2434                 fp->fPatIdx++;
2435             }
2436             break;
2437 
2438 
2439         case URX_LOOP_DOT_I:
2440             // Loop Initialization for the optimized implementation of .*
2441             //   This op scans through all remaining input.
2442             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
2443             {
2444                 // Loop through input until the input is exhausted (we reach an end-of-line)
2445                 // In DOTALL mode, we can just go straight to the end of the input.
2446                 int32_t ix;
2447                 if ((opValue & 1) == 1) {
2448                     // Dot-matches-All mode.  Jump straight to the end of the string.
2449                     ix = fActiveLimit;
2450                     fHitEnd = TRUE;
2451                 } else {
2452                     // NOT DOT ALL mode.  Line endings do not match '.'
2453                     // Scan forward until a line ending or end of input.
2454                     ix = fp->fInputIdx;
2455                     for (;;) {
2456                         if (ix >= fActiveLimit) {
2457                             fHitEnd = TRUE;
2458                             ix = fActiveLimit;
2459                             break;
2460                         }
2461                         UChar32   c;
2462                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
2463                         if ((c & 0x7f) <= 0x29) {        // Fast filter of non-new-line-s
2464                             if ((c == 0x0a) ||            //  0x0a is newline in both modes.
2465                                ((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
2466                                     (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
2467                                 //  char is a line ending.  Put the input pos back to the
2468                                 //    line ending char, and exit the scanning loop.
2469                                 U16_BACK_1(inputBuf, 0, ix);
2470                                 break;
2471                             }
2472                         }
2473                     }
2474                 }
2475 
2476                 // If there were no matching characters, skip over the loop altogether.
2477                 //   The loop doesn't run at all, a * op always succeeds.
2478                 if (ix == fp->fInputIdx) {
2479                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
2480                     break;
2481                 }
2482 
2483                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2484                 //   must follow.  It's operand is the stack location
2485                 //   that holds the starting input index for the match of this .*
2486                 int32_t loopcOp = pat[fp->fPatIdx];
2487                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2488                 int32_t stackLoc = URX_VAL(loopcOp);
2489                 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
2490                 fp->fExtra[stackLoc] = fp->fInputIdx;
2491                 fp->fInputIdx = ix;
2492 
2493                 // Save State to the URX_LOOP_C op that follows this one,
2494                 //   so that match failures in the following code will return to there.
2495                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2496                 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
2497                 fp->fPatIdx++;
2498             }
2499             break;
2500 
2501 
2502         case URX_LOOP_C:
2503             {
2504                 U_ASSERT(opValue>=0 && opValue<frameSize);
2505                 int32_t   terminalIdx =  fp->fExtra[opValue];
2506                 U_ASSERT(terminalIdx <= fp->fInputIdx);
2507                 if (terminalIdx == fp->fInputIdx) {
2508                     // We've backed up the input idx to the point that the loop started.
2509                     // The loop is done.  Leave here without saving state.
2510                     //  Subsequent failures won't come back here.
2511                     break;
2512                 }
2513                 // Set up for the next iteration of the loop, with input index
2514                 //   backed up by one from the last time through,
2515                 //   and a state save to this instruction in case the following code fails again.
2516                 //   (We're going backwards because this loop emulates stack unwinding, not
2517                 //    the initial scan forward.)
2518                 U_ASSERT(fp->fInputIdx > 0);
2519                 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
2520                 if (inputBuf[fp->fInputIdx] == 0x0a &&
2521                     fp->fInputIdx > terminalIdx &&
2522                     inputBuf[fp->fInputIdx-1] == 0x0d) {
2523                     int32_t prevOp = pat[fp->fPatIdx-2];
2524                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
2525                         // .*, stepping back over CRLF pair.
2526                         fp->fInputIdx--;
2527                     }
2528                 }
2529 
2530 
2531                 fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
2532             }
2533             break;
2534 
2535 
2536 
2537         default:
2538             // Trouble.  The compiled pattern contains an entry with an
2539             //           unrecognized type tag.
2540             U_ASSERT(FALSE);
2541         }
2542 
2543         if (U_FAILURE(status)) {
2544             break;
2545         }
2546     }
2547 
2548 breakFromLoop:
2549     fMatch = isMatch;
2550     if (isMatch) {
2551         fLastMatchEnd = fMatchEnd;
2552         fMatchStart   = startIdx;
2553         fMatchEnd     = fp->fInputIdx;
2554         if (fTraceDebug) {
2555             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
2556         }
2557     }
2558     else
2559     {
2560         if (fTraceDebug) {
2561             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2562         }
2563     }
2564 
2565     fFrame = fp;                // The active stack frame when the engine stopped.
2566                                 //   Contains the capture group results that we need to
2567                                 //    access later.
2568 
2569     return;
2570 }
2571 
2572 
2573 
2574 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
2575 
2576 U_NAMESPACE_END
2577 
2578 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
2579 
2580