• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2011 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31 
32 // #include <malloc.h>        // Needed for heapcheck testing
33 
34 
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41 
42 
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53 
54 //#define REGEX_SMART_BACKTRACKING 1
55 
56 U_NAMESPACE_BEGIN
57 
58 // Default limit for the size of the back track stack, to avoid system
59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 //    which use recursion rather than the heap, and take more storage per
62 //    backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65 
66 // Time limit counter constant.
67 //   Time limits for expression evaluation are in terms of quanta of work by
68 //   the engine, each of which is 10,000 state saves.
69 //   This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71 
72 //-----------------------------------------------------------------------------
73 //
74 //   Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78     fDeferredStatus = U_ZERO_ERROR;
79     init(fDeferredStatus);
80     if (U_FAILURE(fDeferredStatus)) {
81         return;
82     }
83     if (pat==NULL) {
84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85         return;
86     }
87     fPattern = pat;
88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90 
91 
92 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                            uint32_t flags, UErrorCode &status) {
95     init(status);
96     if (U_FAILURE(status)) {
97         return;
98     }
99     UParseError    pe;
100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101     fPattern           = fPatternOwned;
102 
103     UText inputText = UTEXT_INITIALIZER;
104     utext_openConstUnicodeString(&inputText, &input, &status);
105     init2(&inputText, status);
106     utext_close(&inputText);
107 
108     fInputUniStrMaybeMutable = TRUE;
109 }
110 
111 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                            uint32_t flags, UErrorCode &status) {
114     init(status);
115     if (U_FAILURE(status)) {
116         return;
117     }
118     UParseError    pe;
119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120     if (U_FAILURE(status)) {
121         return;
122     }
123 
124     fPattern           = fPatternOwned;
125     init2(input, status);
126 }
127 
128 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                            uint32_t flags, UErrorCode &status) {
131     init(status);
132     if (U_FAILURE(status)) {
133         return;
134     }
135     UParseError    pe;
136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137     if (U_FAILURE(status)) {
138         return;
139     }
140     fPattern           = fPatternOwned;
141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145                            uint32_t flags, UErrorCode &status) {
146     init(status);
147     if (U_FAILURE(status)) {
148         return;
149     }
150     UParseError    pe;
151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152         if (U_FAILURE(status)) {
153         return;
154     }
155 
156     fPattern           = fPatternOwned;
157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159 
160 
161 
162 
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164     delete fStack;
165     if (fData != fSmallData) {
166         uprv_free(fData);
167         fData = NULL;
168     }
169     if (fPatternOwned) {
170         delete fPatternOwned;
171         fPatternOwned = NULL;
172         fPattern = NULL;
173     }
174 
175     if (fInput) {
176         delete fInput;
177     }
178     if (fInputText) {
179         utext_close(fInputText);
180     }
181     if (fAltInputText) {
182         utext_close(fAltInputText);
183     }
184 
185     #if UCONFIG_NO_BREAK_ITERATION==0
186     delete fWordBreakItr;
187     #endif
188 }
189 
190 //
191 //   init()   common initialization for use by all constructors.
192 //            Initialize all fields, get the object into a consistent state.
193 //            This must be done even when the initial status shows an error,
194 //            so that the object is initialized sufficiently well for the destructor
195 //            to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198     fPattern           = NULL;
199     fPatternOwned      = NULL;
200     fFrameSize         = 0;
201     fRegionStart       = 0;
202     fRegionLimit       = 0;
203     fAnchorStart       = 0;
204     fAnchorLimit       = 0;
205     fLookStart         = 0;
206     fLookLimit         = 0;
207     fActiveStart       = 0;
208     fActiveLimit       = 0;
209     fTransparentBounds = FALSE;
210     fAnchoringBounds   = TRUE;
211     fMatch             = FALSE;
212     fMatchStart        = 0;
213     fMatchEnd          = 0;
214     fLastMatchEnd      = -1;
215     fAppendPosition    = 0;
216     fHitEnd            = FALSE;
217     fRequireEnd        = FALSE;
218     fStack             = NULL;
219     fFrame             = NULL;
220     fTimeLimit         = 0;
221     fTime              = 0;
222     fTickCounter       = 0;
223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224     fCallbackFn        = NULL;
225     fCallbackContext   = NULL;
226     fFindProgressCallbackFn      = NULL;
227     fFindProgressCallbackContext = NULL;
228     fTraceDebug        = FALSE;
229     fDeferredStatus    = status;
230     fData              = fSmallData;
231     fWordBreakItr      = NULL;
232 
233     fStack             = new UVector64(status);
234     fInputText         = NULL;
235     fAltInputText      = NULL;
236     fInput             = NULL;
237     fInputLength       = 0;
238     fInputUniStrMaybeMutable = FALSE;
239 
240     if (U_FAILURE(status)) {
241         fDeferredStatus = status;
242     }
243 }
244 
245 //
246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247 //            This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250     if (U_FAILURE(status)) {
251         fDeferredStatus = status;
252         return;
253     }
254 
255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257         if (fData == NULL) {
258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259             return;
260         }
261     }
262 
263     reset(input);
264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265     if (U_FAILURE(status)) {
266         fDeferredStatus = status;
267         return;
268     }
269 }
270 
271 
272 static const UChar BACKSLASH  = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 //    appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                               const UnicodeString &replacement,
281                                               UErrorCode &status) {
282     UText replacementText = UTEXT_INITIALIZER;
283 
284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
285     if (U_SUCCESS(status)) {
286         UText resultText = UTEXT_INITIALIZER;
287         utext_openUnicodeString(&resultText, &dest, &status);
288 
289         if (U_SUCCESS(status)) {
290             appendReplacement(&resultText, &replacementText, status);
291             utext_close(&resultText);
292         }
293         utext_close(&replacementText);
294     }
295 
296     return *this;
297 }
298 
299 //
300 //    appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                               UText *replacement,
304                                               UErrorCode &status) {
305     if (U_FAILURE(status)) {
306         return *this;
307     }
308     if (U_FAILURE(fDeferredStatus)) {
309         status = fDeferredStatus;
310         return *this;
311     }
312     if (fMatch == FALSE) {
313         status = U_REGEX_INVALID_STATE;
314         return *this;
315     }
316 
317     // Copy input string from the end of previous match to start of current match
318     int64_t  destLen = utext_nativeLength(dest);
319     if (fMatchStart > fAppendPosition) {
320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
323         } else {
324             int32_t len16;
325             if (UTEXT_USES_U16(fInputText)) {
326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
327             } else {
328                 UErrorCode lengthStatus = U_ZERO_ERROR;
329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330             }
331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332             if (inputChars == NULL) {
333                 status = U_MEMORY_ALLOCATION_ERROR;
334                 return *this;
335             }
336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338             uprv_free(inputChars);
339         }
340     }
341     fAppendPosition = fMatchEnd;
342 
343 
344     // scan the replacement text, looking for substitutions ($n) and \escapes.
345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346     //         move entire ranges not containing substitutions.
347     UTEXT_SETNATIVEINDEX(replacement, 0);
348     UChar32 c = UTEXT_NEXT32(replacement);
349     while (c != U_SENTINEL) {
350         if (c == BACKSLASH) {
351             // Backslash Escape.  Copy the following char out without further checks.
352             //                    Note:  Surrogate pairs don't need any special handling
353             //                           The second half wont be a '$' or a '\', and
354             //                           will move to the dest normally on the next
355             //                           loop iteration.
356             c = UTEXT_CURRENT32(replacement);
357             if (c == U_SENTINEL) {
358                 break;
359             }
360 
361             if (c==0x55/*U*/ || c==0x75/*u*/) {
362                 // We have a \udddd or \Udddddddd escape sequence.
363                 int32_t offset = 0;
364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
367                     if (U_IS_BMP(escapedChar)) {
368                         UChar c16 = (UChar)escapedChar;
369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                     } else {
371                         UChar surrogate[2];
372                         surrogate[0] = U16_LEAD(escapedChar);
373                         surrogate[1] = U16_TRAIL(escapedChar);
374                         if (U_SUCCESS(status)) {
375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                         }
377                     }
378                     // TODO:  Report errors for mal-formed \u escapes?
379                     //        As this is, the original sequence is output, which may be OK.
380                     if (context.lastOffset == offset) {
381                         (void)UTEXT_PREVIOUS32(replacement);
382                     } else if (context.lastOffset != offset-1) {
383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                     }
385                 }
386             } else {
387                 (void)UTEXT_NEXT32(replacement);
388                 // Plain backslash escape.  Just put out the escaped character.
389                 if (U_IS_BMP(c)) {
390                     UChar c16 = (UChar)c;
391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                 } else {
393                     UChar surrogate[2];
394                     surrogate[0] = U16_LEAD(c);
395                     surrogate[1] = U16_TRAIL(c);
396                     if (U_SUCCESS(status)) {
397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                     }
399                 }
400             }
401         } else if (c != DOLLARSIGN) {
402             // Normal char, not a $.  Copy it out without further checks.
403             if (U_IS_BMP(c)) {
404                 UChar c16 = (UChar)c;
405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406             } else {
407                 UChar surrogate[2];
408                 surrogate[0] = U16_LEAD(c);
409                 surrogate[1] = U16_TRAIL(c);
410                 if (U_SUCCESS(status)) {
411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                 }
413             }
414         } else {
415             // We've got a $.  Pick up a capture group number if one follows.
416             // Consume at most the number of digits necessary for the largest capture
417             // number that is valid for this pattern.
418 
419             int32_t numDigits = 0;
420             int32_t groupNum  = 0;
421             UChar32 digitC;
422             for (;;) {
423                 digitC = UTEXT_CURRENT32(replacement);
424                 if (digitC == U_SENTINEL) {
425                     break;
426                 }
427                 if (u_isdigit(digitC) == FALSE) {
428                     break;
429                 }
430                 (void)UTEXT_NEXT32(replacement);
431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
432                 numDigits++;
433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
434                     break;
435                 }
436             }
437 
438 
439             if (numDigits == 0) {
440                 // The $ didn't introduce a group number at all.
441                 // Treat it as just part of the substitution text.
442                 UChar c16 = DOLLARSIGN;
443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444             } else {
445                 // Finally, append the capture group data to the destination.
446                 destLen += appendGroup(groupNum, dest, status);
447                 if (U_FAILURE(status)) {
448                     // Can fail if group number is out of range.
449                     break;
450                 }
451             }
452         }
453 
454         if (U_FAILURE(status)) {
455             break;
456         } else {
457             c = UTEXT_NEXT32(replacement);
458         }
459     }
460 
461     return *this;
462 }
463 
464 
465 
466 //--------------------------------------------------------------------------------
467 //
468 //    appendTail     Intended to be used in conjunction with appendReplacement()
469 //                   To the destination string, append everything following
470 //                   the last match position from the input string.
471 //
472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476     UErrorCode status = U_ZERO_ERROR;
477     UText resultText = UTEXT_INITIALIZER;
478     utext_openUnicodeString(&resultText, &dest, &status);
479 
480     if (U_SUCCESS(status)) {
481         appendTail(&resultText, status);
482         utext_close(&resultText);
483     }
484 
485     return dest;
486 }
487 
488 //
489 //   appendTail, UText mode
490 //
appendTail(UText * dest,UErrorCode & status)491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492     UBool bailOut = FALSE;
493     if (U_FAILURE(status)) {
494         bailOut = TRUE;
495     }
496     if (U_FAILURE(fDeferredStatus)) {
497         status = fDeferredStatus;
498         bailOut = TRUE;
499     }
500 
501     if (bailOut) {
502         //  dest must not be NULL
503         if (dest) {
504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505             return dest;
506         }
507     }
508 
509     if (fInputLength > fAppendPosition) {
510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511             int64_t destLen = utext_nativeLength(dest);
512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                           (int32_t)(fInputLength-fAppendPosition), &status);
514         } else {
515             int32_t len16;
516             if (UTEXT_USES_U16(fInputText)) {
517                 len16 = (int32_t)(fInputLength-fAppendPosition);
518             } else {
519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                 status = U_ZERO_ERROR; // buffer overflow
521             }
522 
523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524             if (inputChars == NULL) {
525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526             } else {
527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                 int64_t destLen = utext_nativeLength(dest);
529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                 uprv_free(inputChars);
531             }
532         }
533     }
534     return dest;
535 }
536 
537 
538 
539 //--------------------------------------------------------------------------------
540 //
541 //   end
542 //
543 //--------------------------------------------------------------------------------
end(UErrorCode & err) const544 int32_t RegexMatcher::end(UErrorCode &err) const {
545     return end(0, err);
546 }
547 
end64(UErrorCode & err) const548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549     return end64(0, err);
550 }
551 
end64(int32_t group,UErrorCode & err) const552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553     if (U_FAILURE(err)) {
554         return -1;
555     }
556     if (fMatch == FALSE) {
557         err = U_REGEX_INVALID_STATE;
558         return -1;
559     }
560     if (group < 0 || group > fPattern->fGroupMap->size()) {
561         err = U_INDEX_OUTOFBOUNDS_ERROR;
562         return -1;
563     }
564     int64_t e = -1;
565     if (group == 0) {
566         e = fMatchEnd;
567     } else {
568         // Get the position within the stack frame of the variables for
569         //    this capture group.
570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571         U_ASSERT(groupOffset < fPattern->fFrameSize);
572         U_ASSERT(groupOffset >= 0);
573         e = fFrame->fExtra[groupOffset + 1];
574     }
575 
576         return e;
577 }
578 
end(int32_t group,UErrorCode & err) const579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580     return (int32_t)end64(group, err);
581 }
582 
583 
584 //--------------------------------------------------------------------------------
585 //
586 //   find()
587 //
588 //--------------------------------------------------------------------------------
find()589 UBool RegexMatcher::find() {
590     // Start at the position of the last match end.  (Will be zero if the
591     //   matcher has been reset.)
592     //
593     if (U_FAILURE(fDeferredStatus)) {
594         return FALSE;
595     }
596 
597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598         return findUsingChunk();
599     }
600 
601     int64_t startPos = fMatchEnd;
602     if (startPos==0) {
603         startPos = fActiveStart;
604     }
605 
606     if (fMatch) {
607         // Save the position of any previous successful match.
608         fLastMatchEnd = fMatchEnd;
609 
610         if (fMatchStart == fMatchEnd) {
611             // Previous match had zero length.  Move start position up one position
612             //  to avoid sending find() into a loop on zero-length matches.
613             if (startPos >= fActiveLimit) {
614                 fMatch = FALSE;
615                 fHitEnd = TRUE;
616                 return FALSE;
617             }
618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
619             (void)UTEXT_NEXT32(fInputText);
620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
621         }
622     } else {
623         if (fLastMatchEnd >= 0) {
624             // A previous find() failed to match.  Don't try again.
625             //   (without this test, a pattern with a zero-length match
626             //    could match again at the end of an input string.)
627             fHitEnd = TRUE;
628             return FALSE;
629         }
630     }
631 
632 
633     // Compute the position in the input string beyond which a match can not begin, because
634     //   the minimum length match would extend past the end of the input.
635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636     //          Be aware of possible overflows if making changes here.
637     int64_t testStartLimit;
638     if (UTEXT_USES_U16(fInputText)) {
639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640         if (startPos > testStartLimit) {
641             fMatch = FALSE;
642             fHitEnd = TRUE;
643             return FALSE;
644         }
645     } else {
646         // For now, let the matcher discover that it can't match on its own
647         // We don't know how long the match len is in native characters
648         testStartLimit = fActiveLimit;
649     }
650 
651     UChar32  c;
652     U_ASSERT(startPos >= 0);
653 
654     switch (fPattern->fStartType) {
655     case START_NO_INFO:
656         // No optimization was found.
657         //  Try a match at each input position.
658         for (;;) {
659             MatchAt(startPos, FALSE, fDeferredStatus);
660             if (U_FAILURE(fDeferredStatus)) {
661                 return FALSE;
662             }
663             if (fMatch) {
664                 return TRUE;
665             }
666             if (startPos >= testStartLimit) {
667                 fHitEnd = TRUE;
668                 return FALSE;
669             }
670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
671             (void)UTEXT_NEXT32(fInputText);
672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
673             // Note that it's perfectly OK for a pattern to have a zero-length
674             //   match at the end of a string, so we must make sure that the loop
675             //   runs with startPos == testStartLimit the last time through.
676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                 return FALSE;
678         }
679         U_ASSERT(FALSE);
680 
681     case START_START:
682         // Matches are only possible at the start of the input string
683         //   (pattern begins with ^ or \A)
684         if (startPos > fActiveStart) {
685             fMatch = FALSE;
686             return FALSE;
687         }
688         MatchAt(startPos, FALSE, fDeferredStatus);
689         if (U_FAILURE(fDeferredStatus)) {
690             return FALSE;
691         }
692         return fMatch;
693 
694 
695     case START_SET:
696         {
697             // Match may start on any char from a pre-computed set.
698             U_ASSERT(fPattern->fMinMatchLen > 0);
699             int64_t pos;
700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
701             for (;;) {
702                 c = UTEXT_NEXT32(fInputText);
703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
705                 // skip this next block (so we don't have a negative array index)
706                 // and handle end of text in the following block.
707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                     MatchAt(startPos, FALSE, fDeferredStatus);
710                     if (U_FAILURE(fDeferredStatus)) {
711                         return FALSE;
712                     }
713                     if (fMatch) {
714                         return TRUE;
715                     }
716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
717                 }
718                 if (startPos >= testStartLimit) {
719                     fMatch = FALSE;
720                     fHitEnd = TRUE;
721                     return FALSE;
722                 }
723                 startPos = pos;
724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                     return FALSE;
726             }
727         }
728         U_ASSERT(FALSE);
729 
730     case START_STRING:
731     case START_CHAR:
732         {
733             // Match starts on exactly one char.
734             U_ASSERT(fPattern->fMinMatchLen > 0);
735             UChar32 theChar = fPattern->fInitialChar;
736             int64_t pos;
737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
738             for (;;) {
739                 c = UTEXT_NEXT32(fInputText);
740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
741                 if (c == theChar) {
742                     MatchAt(startPos, FALSE, fDeferredStatus);
743                     if (U_FAILURE(fDeferredStatus)) {
744                         return FALSE;
745                     }
746                     if (fMatch) {
747                         return TRUE;
748                     }
749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
750                 }
751                 if (startPos >= testStartLimit) {
752                     fMatch = FALSE;
753                     fHitEnd = TRUE;
754                     return FALSE;
755                 }
756                 startPos = pos;
757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                     return FALSE;
759            }
760         }
761         U_ASSERT(FALSE);
762 
763     case START_LINE:
764         {
765             UChar32  c;
766             if (startPos == fAnchorStart) {
767                 MatchAt(startPos, FALSE, fDeferredStatus);
768                 if (U_FAILURE(fDeferredStatus)) {
769                     return FALSE;
770                 }
771                 if (fMatch) {
772                     return TRUE;
773                 }
774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                 c = UTEXT_NEXT32(fInputText);
776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777             } else {
778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                 c = UTEXT_PREVIOUS32(fInputText);
780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781             }
782 
783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                 for (;;) {
785                     if (c == 0x0a) {
786                             MatchAt(startPos, FALSE, fDeferredStatus);
787                             if (U_FAILURE(fDeferredStatus)) {
788                                 return FALSE;
789                             }
790                             if (fMatch) {
791                                 return TRUE;
792                             }
793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                     }
795                     if (startPos >= testStartLimit) {
796                         fMatch = FALSE;
797                         fHitEnd = TRUE;
798                         return FALSE;
799                     }
800                     c = UTEXT_NEXT32(fInputText);
801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                     // Note that it's perfectly OK for a pattern to have a zero-length
803                     //   match at the end of a string, so we must make sure that the loop
804                     //   runs with startPos == testStartLimit the last time through.
805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                         return FALSE;
807                 }
808             } else {
809                 for (;;) {
810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                 (void)UTEXT_NEXT32(fInputText);
814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                             }
816                             MatchAt(startPos, FALSE, fDeferredStatus);
817                             if (U_FAILURE(fDeferredStatus)) {
818                                 return FALSE;
819                             }
820                             if (fMatch) {
821                                 return TRUE;
822                             }
823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                     }
825                     if (startPos >= testStartLimit) {
826                         fMatch = FALSE;
827                         fHitEnd = TRUE;
828                         return FALSE;
829                     }
830                     c = UTEXT_NEXT32(fInputText);
831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                     // Note that it's perfectly OK for a pattern to have a zero-length
833                     //   match at the end of a string, so we must make sure that the loop
834                     //   runs with startPos == testStartLimit the last time through.
835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                         return FALSE;
837                 }
838             }
839         }
840 
841     default:
842         U_ASSERT(FALSE);
843     }
844 
845     U_ASSERT(FALSE);
846     return FALSE;
847 }
848 
849 
850 
find(int64_t start,UErrorCode & status)851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852     if (U_FAILURE(status)) {
853         return FALSE;
854     }
855     if (U_FAILURE(fDeferredStatus)) {
856         status = fDeferredStatus;
857         return FALSE;
858     }
859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                           //        This will reset the region to be the full input length.
861     if (start < 0) {
862         status = U_INDEX_OUTOFBOUNDS_ERROR;
863         return FALSE;
864     }
865 
866     int64_t nativeStart = start;
867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868         status = U_INDEX_OUTOFBOUNDS_ERROR;
869         return FALSE;
870     }
871     fMatchEnd = nativeStart;
872     return find();
873 }
874 
875 
876 //--------------------------------------------------------------------------------
877 //
878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
879 //                       entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
findUsingChunk()882 UBool RegexMatcher::findUsingChunk() {
883     // Start at the position of the last match end.  (Will be zero if the
884     //   matcher has been reset.
885     //
886 
887     int32_t startPos = (int32_t)fMatchEnd;
888     if (startPos==0) {
889         startPos = (int32_t)fActiveStart;
890     }
891 
892     const UChar *inputBuf = fInputText->chunkContents;
893 
894     if (fMatch) {
895         // Save the position of any previous successful match.
896         fLastMatchEnd = fMatchEnd;
897 
898         if (fMatchStart == fMatchEnd) {
899             // Previous match had zero length.  Move start position up one position
900             //  to avoid sending find() into a loop on zero-length matches.
901             if (startPos >= fActiveLimit) {
902                 fMatch = FALSE;
903                 fHitEnd = TRUE;
904                 return FALSE;
905             }
906             U16_FWD_1(inputBuf, startPos, fInputLength);
907         }
908     } else {
909         if (fLastMatchEnd >= 0) {
910             // A previous find() failed to match.  Don't try again.
911             //   (without this test, a pattern with a zero-length match
912             //    could match again at the end of an input string.)
913             fHitEnd = TRUE;
914             return FALSE;
915         }
916     }
917 
918 
919     // Compute the position in the input string beyond which a match can not begin, because
920     //   the minimum length match would extend past the end of the input.
921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922     //          Be aware of possible overflows if making changes here.
923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924     if (startPos > testLen) {
925         fMatch = FALSE;
926         fHitEnd = TRUE;
927         return FALSE;
928     }
929 
930     UChar32  c;
931     U_ASSERT(startPos >= 0);
932 
933     switch (fPattern->fStartType) {
934     case START_NO_INFO:
935         // No optimization was found.
936         //  Try a match at each input position.
937         for (;;) {
938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
939             if (U_FAILURE(fDeferredStatus)) {
940                 return FALSE;
941             }
942             if (fMatch) {
943                 return TRUE;
944             }
945             if (startPos >= testLen) {
946                 fHitEnd = TRUE;
947                 return FALSE;
948             }
949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
950             // Note that it's perfectly OK for a pattern to have a zero-length
951             //   match at the end of a string, so we must make sure that the loop
952             //   runs with startPos == testLen the last time through.
953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                 return FALSE;
955         }
956         U_ASSERT(FALSE);
957 
958     case START_START:
959         // Matches are only possible at the start of the input string
960         //   (pattern begins with ^ or \A)
961         if (startPos > fActiveStart) {
962             fMatch = FALSE;
963             return FALSE;
964         }
965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
966         if (U_FAILURE(fDeferredStatus)) {
967             return FALSE;
968         }
969         return fMatch;
970 
971 
972     case START_SET:
973     {
974         // Match may start on any char from a pre-computed set.
975         U_ASSERT(fPattern->fMinMatchLen > 0);
976         for (;;) {
977             int32_t pos = startPos;
978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
982                 if (U_FAILURE(fDeferredStatus)) {
983                     return FALSE;
984                 }
985                 if (fMatch) {
986                     return TRUE;
987                 }
988             }
989             if (pos >= testLen) {
990                 fMatch = FALSE;
991                 fHitEnd = TRUE;
992                 return FALSE;
993             }
994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                 return FALSE;
996         }
997     }
998         U_ASSERT(FALSE);
999 
1000     case START_STRING:
1001     case START_CHAR:
1002     {
1003         // Match starts on exactly one char.
1004         U_ASSERT(fPattern->fMinMatchLen > 0);
1005         UChar32 theChar = fPattern->fInitialChar;
1006         for (;;) {
1007             int32_t pos = startPos;
1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009             if (c == theChar) {
1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                 if (U_FAILURE(fDeferredStatus)) {
1012                     return FALSE;
1013                 }
1014                 if (fMatch) {
1015                     return TRUE;
1016                 }
1017             }
1018             if (pos >= testLen) {
1019                 fMatch = FALSE;
1020                 fHitEnd = TRUE;
1021                 return FALSE;
1022             }
1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                 return FALSE;
1025         }
1026     }
1027         U_ASSERT(FALSE);
1028 
1029     case START_LINE:
1030     {
1031         UChar32  c;
1032         if (startPos == fAnchorStart) {
1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034             if (U_FAILURE(fDeferredStatus)) {
1035                 return FALSE;
1036             }
1037             if (fMatch) {
1038                 return TRUE;
1039             }
1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041         }
1042 
1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044             for (;;) {
1045                 c = inputBuf[startPos-1];
1046                 if (c == 0x0a) {
1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                     if (U_FAILURE(fDeferredStatus)) {
1049                         return FALSE;
1050                     }
1051                     if (fMatch) {
1052                         return TRUE;
1053                     }
1054                 }
1055                 if (startPos >= testLen) {
1056                     fMatch = FALSE;
1057                     fHitEnd = TRUE;
1058                     return FALSE;
1059                 }
1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                 // Note that it's perfectly OK for a pattern to have a zero-length
1062                 //   match at the end of a string, so we must make sure that the loop
1063                 //   runs with startPos == testLen the last time through.
1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                     return FALSE;
1066             }
1067         } else {
1068             for (;;) {
1069                 c = inputBuf[startPos-1];
1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                         startPos++;
1074                     }
1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                     if (U_FAILURE(fDeferredStatus)) {
1077                         return FALSE;
1078                     }
1079                     if (fMatch) {
1080                         return TRUE;
1081                     }
1082                 }
1083                 if (startPos >= testLen) {
1084                     fMatch = FALSE;
1085                     fHitEnd = TRUE;
1086                     return FALSE;
1087                 }
1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                 // Note that it's perfectly OK for a pattern to have a zero-length
1090                 //   match at the end of a string, so we must make sure that the loop
1091                 //   runs with startPos == testLen the last time through.
1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                     return FALSE;
1094             }
1095         }
1096     }
1097 
1098     default:
1099         U_ASSERT(FALSE);
1100     }
1101 
1102     U_ASSERT(FALSE);
1103     return FALSE;
1104 }
1105 
1106 
1107 
1108 //--------------------------------------------------------------------------------
1109 //
1110 //  group()
1111 //
1112 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114     return group(0, status);
1115 }
1116 
1117 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119     return group(0, dest, group_len, status);
1120 }
1121 
1122 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124     group_len = 0;
1125     UBool bailOut = FALSE;
1126     if (U_FAILURE(status)) {
1127         return dest;
1128     }
1129     if (U_FAILURE(fDeferredStatus)) {
1130         status = fDeferredStatus;
1131         bailOut = TRUE;
1132     }
1133     if (fMatch == FALSE) {
1134         status = U_REGEX_INVALID_STATE;
1135         bailOut = TRUE;
1136     }
1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
1139         bailOut = TRUE;
1140     }
1141 
1142     if (bailOut) {
1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144     }
1145 
1146     int64_t s, e;
1147     if (groupNum == 0) {
1148         s = fMatchStart;
1149         e = fMatchEnd;
1150     } else {
1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
1153         U_ASSERT(groupOffset >= 0);
1154         s = fFrame->fExtra[groupOffset];
1155         e = fFrame->fExtra[groupOffset+1];
1156     }
1157 
1158     if (s < 0) {
1159         // A capture group wasn't part of the match
1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161     }
1162     U_ASSERT(s <= e);
1163     group_len = e - s;
1164 
1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166     if (dest)
1167         UTEXT_SETNATIVEINDEX(dest, s);
1168     return dest;
1169 }
1170 
group(int32_t groupNum,UErrorCode & status) const1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172     UnicodeString result;
1173     if (U_FAILURE(status)) {
1174         return result;
1175     }
1176     UText resultText = UTEXT_INITIALIZER;
1177     utext_openUnicodeString(&resultText, &result, &status);
1178     group(groupNum, &resultText, status);
1179     utext_close(&resultText);
1180     return result;
1181 }
1182 
1183 
1184 //  Return deep (mutable) clone
1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 //		using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188     UBool bailOut = FALSE;
1189     if (U_FAILURE(status)) {
1190         return dest;
1191     }
1192     if (U_FAILURE(fDeferredStatus)) {
1193         status = fDeferredStatus;
1194         bailOut = TRUE;
1195     }
1196 
1197     if (fMatch == FALSE) {
1198         status = U_REGEX_INVALID_STATE;
1199         bailOut = TRUE;
1200     }
1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
1203         bailOut = TRUE;
1204     }
1205 
1206     if (bailOut) {
1207         if (dest) {
1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209             return dest;
1210         } else {
1211             return utext_openUChars(NULL, NULL, 0, &status);
1212         }
1213     }
1214 
1215     int64_t s, e;
1216     if (groupNum == 0) {
1217         s = fMatchStart;
1218         e = fMatchEnd;
1219     } else {
1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
1222         U_ASSERT(groupOffset >= 0);
1223         s = fFrame->fExtra[groupOffset];
1224         e = fFrame->fExtra[groupOffset+1];
1225     }
1226 
1227     if (s < 0) {
1228         // A capture group wasn't part of the match
1229         if (dest) {
1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231             return dest;
1232         } else {
1233             return utext_openUChars(NULL, NULL, 0, &status);
1234         }
1235     }
1236     U_ASSERT(s <= e);
1237 
1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239         U_ASSERT(e <= fInputLength);
1240         if (dest) {
1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242         } else {
1243             UText groupText = UTEXT_INITIALIZER;
1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246             utext_close(&groupText);
1247         }
1248     } else {
1249         int32_t len16;
1250         if (UTEXT_USES_U16(fInputText)) {
1251             len16 = (int32_t)(e-s);
1252         } else {
1253             UErrorCode lengthStatus = U_ZERO_ERROR;
1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255         }
1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257         if (groupChars == NULL) {
1258             status = U_MEMORY_ALLOCATION_ERROR;
1259             return dest;
1260         }
1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262 
1263         if (dest) {
1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265         } else {
1266             UText groupText = UTEXT_INITIALIZER;
1267             utext_openUChars(&groupText, groupChars, len16, &status);
1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269             utext_close(&groupText);
1270         }
1271 
1272         uprv_free(groupChars);
1273     }
1274     return dest;
1275 }
1276 
1277 //--------------------------------------------------------------------------------
1278 //
1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
1280 //                   than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285     if (U_FAILURE(status)) {
1286         return 0;
1287     }
1288     if (U_FAILURE(fDeferredStatus)) {
1289         status = fDeferredStatus;
1290         return 0;
1291     }
1292     int64_t destLen = utext_nativeLength(dest);
1293 
1294     if (fMatch == FALSE) {
1295         status = U_REGEX_INVALID_STATE;
1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297     }
1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301     }
1302 
1303     int64_t s, e;
1304     if (groupNum == 0) {
1305         s = fMatchStart;
1306         e = fMatchEnd;
1307     } else {
1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
1310         U_ASSERT(groupOffset >= 0);
1311         s = fFrame->fExtra[groupOffset];
1312         e = fFrame->fExtra[groupOffset+1];
1313     }
1314 
1315     if (s < 0) {
1316         // A capture group wasn't part of the match
1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318     }
1319     U_ASSERT(s <= e);
1320 
1321     int64_t deltaLen;
1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323         U_ASSERT(e <= fInputLength);
1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325     } else {
1326         int32_t len16;
1327         if (UTEXT_USES_U16(fInputText)) {
1328             len16 = (int32_t)(e-s);
1329         } else {
1330             UErrorCode lengthStatus = U_ZERO_ERROR;
1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332         }
1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334         if (groupChars == NULL) {
1335             status = U_MEMORY_ALLOCATION_ERROR;
1336             return 0;
1337         }
1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339 
1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341         uprv_free(groupChars);
1342     }
1343     return deltaLen;
1344 }
1345 
1346 
1347 
1348 //--------------------------------------------------------------------------------
1349 //
1350 //  groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
groupCount() const1353 int32_t RegexMatcher::groupCount() const {
1354     return fPattern->fGroupMap->size();
1355 }
1356 
1357 
1358 
1359 //--------------------------------------------------------------------------------
1360 //
1361 //  hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365     return fAnchoringBounds;
1366 }
1367 
1368 
1369 //--------------------------------------------------------------------------------
1370 //
1371 //  hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
hasTransparentBounds() const1374 UBool RegexMatcher::hasTransparentBounds() const {
1375     return fTransparentBounds;
1376 }
1377 
1378 
1379 
1380 //--------------------------------------------------------------------------------
1381 //
1382 //  hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
hitEnd() const1385 UBool RegexMatcher::hitEnd() const {
1386     return fHitEnd;
1387 }
1388 
1389 
1390 //--------------------------------------------------------------------------------
1391 //
1392 //  input()
1393 //
1394 //--------------------------------------------------------------------------------
input() const1395 const UnicodeString &RegexMatcher::input() const {
1396     if (!fInput) {
1397         UErrorCode status = U_ZERO_ERROR;
1398         int32_t len16;
1399         if (UTEXT_USES_U16(fInputText)) {
1400             len16 = (int32_t)fInputLength;
1401         } else {
1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403             status = U_ZERO_ERROR; // overflow, length status
1404         }
1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
1406 
1407         UChar *inputChars = result->getBuffer(len16);
1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409         result->releaseBuffer(len16);
1410 
1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412     }
1413 
1414     return *fInput;
1415 }
1416 
1417 //--------------------------------------------------------------------------------
1418 //
1419 //  inputText()
1420 //
1421 //--------------------------------------------------------------------------------
inputText() const1422 UText *RegexMatcher::inputText() const {
1423     return fInputText;
1424 }
1425 
1426 
1427 //--------------------------------------------------------------------------------
1428 //
1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433     UBool bailOut = FALSE;
1434     if (U_FAILURE(status)) {
1435         return dest;
1436     }
1437     if (U_FAILURE(fDeferredStatus)) {
1438         status = fDeferredStatus;
1439         bailOut = TRUE;
1440     }
1441 
1442     if (bailOut) {
1443         if (dest) {
1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445             return dest;
1446         } else {
1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448         }
1449     }
1450 
1451     if (dest) {
1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454         } else {
1455             int32_t input16Len;
1456             if (UTEXT_USES_U16(fInputText)) {
1457                 input16Len = (int32_t)fInputLength;
1458             } else {
1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461             }
1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463             if (inputChars == NULL) {
1464                 return dest;
1465             }
1466 
1467             status = U_ZERO_ERROR;
1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469             status = U_ZERO_ERROR;
1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471 
1472             uprv_free(inputChars);
1473         }
1474         return dest;
1475     } else {
1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477     }
1478 }
1479 
1480 
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483     UBool retVal = FALSE;
1484 
1485     //  In the following test, we're really only interested in whether the UText should switch
1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487     //  will still point to the correct data.
1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489         UnicodeString *us=(UnicodeString *)ut->context;
1490 
1491         // Update to the latest length.
1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493         int32_t newLength = us->length();
1494 
1495         // Update the chunk description.
1496         // The buffer may have switched between stack- and heap-based.
1497         ut->chunkContents    = us->getBuffer();
1498         ut->chunkLength      = newLength;
1499         ut->chunkNativeLimit = newLength;
1500         ut->nativeIndexingLimit = newLength;
1501         retVal = TRUE;
1502     }
1503 
1504     return retVal;
1505 }
1506 
1507 //--------------------------------------------------------------------------------
1508 //
1509 //  lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513     if (U_FAILURE(status)) {
1514         return FALSE;
1515     }
1516     if (U_FAILURE(fDeferredStatus)) {
1517         status = fDeferredStatus;
1518         return FALSE;
1519     }
1520 
1521     if (fInputUniStrMaybeMutable) {
1522         if (compat_SyncMutableUTextContents(fInputText)) {
1523         fInputLength = utext_nativeLength(fInputText);
1524         reset();
1525         }
1526     }
1527     else {
1528         resetPreserveRegion();
1529     }
1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532     } else {
1533         MatchAt(fActiveStart, FALSE, status);
1534     }
1535     return fMatch;
1536 }
1537 
1538 
lookingAt(int64_t start,UErrorCode & status)1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540     if (U_FAILURE(status)) {
1541         return FALSE;
1542     }
1543     if (U_FAILURE(fDeferredStatus)) {
1544         status = fDeferredStatus;
1545         return FALSE;
1546     }
1547     reset();
1548 
1549     if (start < 0) {
1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
1551         return FALSE;
1552     }
1553 
1554     if (fInputUniStrMaybeMutable) {
1555         if (compat_SyncMutableUTextContents(fInputText)) {
1556         fInputLength = utext_nativeLength(fInputText);
1557         reset();
1558         }
1559     }
1560 
1561     int64_t nativeStart;
1562     nativeStart = start;
1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
1565         return FALSE;
1566     }
1567 
1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570     } else {
1571         MatchAt(nativeStart, FALSE, status);
1572     }
1573     return fMatch;
1574 }
1575 
1576 
1577 
1578 //--------------------------------------------------------------------------------
1579 //
1580 //  matches()
1581 //
1582 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584     if (U_FAILURE(status)) {
1585         return FALSE;
1586     }
1587     if (U_FAILURE(fDeferredStatus)) {
1588         status = fDeferredStatus;
1589         return FALSE;
1590     }
1591 
1592     if (fInputUniStrMaybeMutable) {
1593         if (compat_SyncMutableUTextContents(fInputText)) {
1594         fInputLength = utext_nativeLength(fInputText);
1595         reset();
1596         }
1597     }
1598     else {
1599         resetPreserveRegion();
1600     }
1601 
1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604     } else {
1605         MatchAt(fActiveStart, TRUE, status);
1606     }
1607     return fMatch;
1608 }
1609 
1610 
matches(int64_t start,UErrorCode & status)1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612     if (U_FAILURE(status)) {
1613         return FALSE;
1614     }
1615     if (U_FAILURE(fDeferredStatus)) {
1616         status = fDeferredStatus;
1617         return FALSE;
1618     }
1619     reset();
1620 
1621     if (start < 0) {
1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
1623         return FALSE;
1624     }
1625 
1626     if (fInputUniStrMaybeMutable) {
1627         if (compat_SyncMutableUTextContents(fInputText)) {
1628         fInputLength = utext_nativeLength(fInputText);
1629         reset();
1630         }
1631     }
1632 
1633     int64_t nativeStart;
1634     nativeStart = start;
1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
1637         return FALSE;
1638     }
1639 
1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642     } else {
1643         MatchAt(nativeStart, TRUE, status);
1644     }
1645     return fMatch;
1646 }
1647 
1648 
1649 
1650 //--------------------------------------------------------------------------------
1651 //
1652 //    pattern
1653 //
1654 //--------------------------------------------------------------------------------
pattern() const1655 const RegexPattern &RegexMatcher::pattern() const {
1656     return *fPattern;
1657 }
1658 
1659 
1660 
1661 //--------------------------------------------------------------------------------
1662 //
1663 //    region
1664 //
1665 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667     if (U_FAILURE(status)) {
1668         return *this;
1669     }
1670 
1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672         status = U_ILLEGAL_ARGUMENT_ERROR;
1673     }
1674 
1675     int64_t nativeStart = regionStart;
1676     int64_t nativeLimit = regionLimit;
1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678       status = U_ILLEGAL_ARGUMENT_ERROR;
1679     }
1680 
1681     if (startIndex == -1)
1682       this->reset();
1683     else
1684       resetPreserveRegion();
1685 
1686     fRegionStart = nativeStart;
1687     fRegionLimit = nativeLimit;
1688     fActiveStart = nativeStart;
1689     fActiveLimit = nativeLimit;
1690 
1691     if (startIndex != -1) {
1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
1694       }
1695       fMatchEnd = startIndex;
1696     }
1697 
1698     if (!fTransparentBounds) {
1699         fLookStart = nativeStart;
1700         fLookLimit = nativeLimit;
1701     }
1702     if (fAnchoringBounds) {
1703         fAnchorStart = nativeStart;
1704         fAnchorLimit = nativeLimit;
1705     }
1706     return *this;
1707 }
1708 
region(int64_t start,int64_t limit,UErrorCode & status)1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710   return region(start, limit, -1, status);
1711 }
1712 
1713 //--------------------------------------------------------------------------------
1714 //
1715 //    regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
regionEnd() const1718 int32_t RegexMatcher::regionEnd() const {
1719     return (int32_t)fRegionLimit;
1720 }
1721 
regionEnd64() const1722 int64_t RegexMatcher::regionEnd64() const {
1723     return fRegionLimit;
1724 }
1725 
1726 //--------------------------------------------------------------------------------
1727 //
1728 //    regionStart
1729 //
1730 //--------------------------------------------------------------------------------
regionStart() const1731 int32_t RegexMatcher::regionStart() const {
1732     return (int32_t)fRegionStart;
1733 }
1734 
regionStart64() const1735 int64_t RegexMatcher::regionStart64() const {
1736     return fRegionStart;
1737 }
1738 
1739 
1740 //--------------------------------------------------------------------------------
1741 //
1742 //    replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746     UText replacementText = UTEXT_INITIALIZER;
1747     UText resultText = UTEXT_INITIALIZER;
1748     UnicodeString resultString;
1749     if (U_FAILURE(status)) {
1750         return resultString;
1751     }
1752 
1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754     utext_openUnicodeString(&resultText, &resultString, &status);
1755 
1756     replaceAll(&replacementText, &resultText, status);
1757 
1758     utext_close(&resultText);
1759     utext_close(&replacementText);
1760 
1761     return resultString;
1762 }
1763 
1764 
1765 //
1766 //    replaceAll, UText mode
1767 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769     if (U_FAILURE(status)) {
1770         return dest;
1771     }
1772     if (U_FAILURE(fDeferredStatus)) {
1773         status = fDeferredStatus;
1774         return dest;
1775     }
1776 
1777     if (dest == NULL) {
1778         UnicodeString emptyString;
1779         UText empty = UTEXT_INITIALIZER;
1780 
1781         utext_openUnicodeString(&empty, &emptyString, &status);
1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783         utext_close(&empty);
1784     }
1785 
1786     if (U_SUCCESS(status)) {
1787         reset();
1788         while (find()) {
1789             appendReplacement(dest, replacement, status);
1790             if (U_FAILURE(status)) {
1791                 break;
1792             }
1793         }
1794         appendTail(dest, status);
1795     }
1796 
1797     return dest;
1798 }
1799 
1800 
1801 //--------------------------------------------------------------------------------
1802 //
1803 //    replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807     UText replacementText = UTEXT_INITIALIZER;
1808     UText resultText = UTEXT_INITIALIZER;
1809     UnicodeString resultString;
1810 
1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812     utext_openUnicodeString(&resultText, &resultString, &status);
1813 
1814     replaceFirst(&replacementText, &resultText, status);
1815 
1816     utext_close(&resultText);
1817     utext_close(&replacementText);
1818 
1819     return resultString;
1820 }
1821 
1822 //
1823 //    replaceFirst, UText mode
1824 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826     if (U_FAILURE(status)) {
1827         return dest;
1828     }
1829     if (U_FAILURE(fDeferredStatus)) {
1830         status = fDeferredStatus;
1831         return dest;
1832     }
1833 
1834     reset();
1835     if (!find()) {
1836         return getInput(dest, status);
1837     }
1838 
1839     if (dest == NULL) {
1840         UnicodeString emptyString;
1841         UText empty = UTEXT_INITIALIZER;
1842 
1843         utext_openUnicodeString(&empty, &emptyString, &status);
1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845         utext_close(&empty);
1846     }
1847 
1848     appendReplacement(dest, replacement, status);
1849     appendTail(dest, status);
1850 
1851     return dest;
1852 }
1853 
1854 
1855 //--------------------------------------------------------------------------------
1856 //
1857 //     requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
requireEnd() const1860 UBool RegexMatcher::requireEnd() const {
1861     return fRequireEnd;
1862 }
1863 
1864 
1865 //--------------------------------------------------------------------------------
1866 //
1867 //     reset
1868 //
1869 //--------------------------------------------------------------------------------
reset()1870 RegexMatcher &RegexMatcher::reset() {
1871     fRegionStart    = 0;
1872     fRegionLimit    = fInputLength;
1873     fActiveStart    = 0;
1874     fActiveLimit    = fInputLength;
1875     fAnchorStart    = 0;
1876     fAnchorLimit    = fInputLength;
1877     fLookStart      = 0;
1878     fLookLimit      = fInputLength;
1879     resetPreserveRegion();
1880     return *this;
1881 }
1882 
1883 
1884 
resetPreserveRegion()1885 void RegexMatcher::resetPreserveRegion() {
1886     fMatchStart     = 0;
1887     fMatchEnd       = 0;
1888     fLastMatchEnd   = -1;
1889     fAppendPosition = 0;
1890     fMatch          = FALSE;
1891     fHitEnd         = FALSE;
1892     fRequireEnd     = FALSE;
1893     fTime           = 0;
1894     fTickCounter    = TIMER_INITIAL_VALUE;
1895     //resetStack(); // more expensive than it looks...
1896 }
1897 
1898 
reset(const UnicodeString & input)1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901     if (fPattern->fNeedsAltInput) {
1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903     }
1904     fInputLength = utext_nativeLength(fInputText);
1905 
1906     reset();
1907     delete fInput;
1908     fInput = NULL;
1909 
1910     //  Do the following for any UnicodeString.
1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912     fInputUniStrMaybeMutable = TRUE;
1913 
1914     if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916         UErrorCode status = U_ZERO_ERROR;
1917         fWordBreakItr->setText(fInputText, status);
1918 #endif
1919     }
1920     return *this;
1921 }
1922 
1923 
reset(UText * input)1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925     if (fInputText != input) {
1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928         fInputLength = utext_nativeLength(fInputText);
1929 
1930         delete fInput;
1931         fInput = NULL;
1932 
1933         if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935             UErrorCode status = U_ZERO_ERROR;
1936             fWordBreakItr->setText(input, status);
1937 #endif
1938         }
1939     }
1940     reset();
1941     fInputUniStrMaybeMutable = FALSE;
1942 
1943     return *this;
1944 }
1945 
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948     return *this;
1949 }*/
1950 
reset(int64_t position,UErrorCode & status)1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952     if (U_FAILURE(status)) {
1953         return *this;
1954     }
1955     reset();       // Reset also resets the region to be the entire string.
1956 
1957     if (position < 0 || position > fActiveLimit) {
1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
1959         return *this;
1960     }
1961     fMatchEnd = position;
1962     return *this;
1963 }
1964 
1965 
1966 //--------------------------------------------------------------------------------
1967 //
1968 //    refresh
1969 //
1970 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1971 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1972     if (U_FAILURE(status)) {
1973         return *this;
1974     }
1975     if (input == NULL) {
1976         status = U_ILLEGAL_ARGUMENT_ERROR;
1977         return *this;
1978     }
1979     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1980         status = U_ILLEGAL_ARGUMENT_ERROR;
1981         return *this;
1982     }
1983     int64_t  pos = utext_getNativeIndex(fInputText);
1984     //  Shallow read-only clone of the new UText into the existing input UText
1985     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1986     if (U_FAILURE(status)) {
1987         return *this;
1988     }
1989     utext_setNativeIndex(fInputText, pos);
1990 
1991     if (fAltInputText != NULL) {
1992         pos = utext_getNativeIndex(fAltInputText);
1993         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1994         if (U_FAILURE(status)) {
1995             return *this;
1996         }
1997         utext_setNativeIndex(fAltInputText, pos);
1998     }
1999     return *this;
2000 }
2001 
2002 
2003 
2004 //--------------------------------------------------------------------------------
2005 //
2006 //    setTrace
2007 //
2008 //--------------------------------------------------------------------------------
setTrace(UBool state)2009 void RegexMatcher::setTrace(UBool state) {
2010     fTraceDebug = state;
2011 }
2012 
2013 
2014 
2015 //---------------------------------------------------------------------
2016 //
2017 //   split
2018 //
2019 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2020 int32_t  RegexMatcher::split(const UnicodeString &input,
2021         UnicodeString    dest[],
2022         int32_t          destCapacity,
2023         UErrorCode      &status)
2024 {
2025     UText inputText = UTEXT_INITIALIZER;
2026     utext_openConstUnicodeString(&inputText, &input, &status);
2027     if (U_FAILURE(status)) {
2028         return 0;
2029     }
2030 
2031     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2032     if (destText == NULL) {
2033         status = U_MEMORY_ALLOCATION_ERROR;
2034         return 0;
2035     }
2036     int32_t i;
2037     for (i = 0; i < destCapacity; i++) {
2038         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2039     }
2040 
2041     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2042 
2043     for (i = 0; i < destCapacity; i++) {
2044         utext_close(destText[i]);
2045     }
2046 
2047     uprv_free(destText);
2048     utext_close(&inputText);
2049     return fieldCount;
2050 }
2051 
2052 //
2053 //   split, UText mode
2054 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2055 int32_t  RegexMatcher::split(UText *input,
2056         UText           *dest[],
2057         int32_t          destCapacity,
2058         UErrorCode      &status)
2059 {
2060     //
2061     // Check arguements for validity
2062     //
2063     if (U_FAILURE(status)) {
2064         return 0;
2065     };
2066 
2067     if (destCapacity < 1) {
2068         status = U_ILLEGAL_ARGUMENT_ERROR;
2069         return 0;
2070     }
2071 
2072     //
2073     // Reset for the input text
2074     //
2075     reset(input);
2076     int64_t   nextOutputStringStart = 0;
2077     if (fActiveLimit == 0) {
2078         return 0;
2079     }
2080 
2081     //
2082     // Loop through the input text, searching for the delimiter pattern
2083     //
2084     int32_t i;
2085     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2086     for (i=0; ; i++) {
2087         if (i>=destCapacity-1) {
2088             // There is one or zero output string left.
2089             // Fill the last output string with whatever is left from the input, then exit the loop.
2090             //  ( i will be == destCapacity if we filled the output array while processing
2091             //    capture groups of the delimiter expression, in which case we will discard the
2092             //    last capture group saved in favor of the unprocessed remainder of the
2093             //    input string.)
2094             i = destCapacity-1;
2095             if (fActiveLimit > nextOutputStringStart) {
2096                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2097                     if (dest[i]) {
2098                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2099                                       input->chunkContents+nextOutputStringStart,
2100                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2101                     } else {
2102                         UText remainingText = UTEXT_INITIALIZER;
2103                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2104                                          fActiveLimit-nextOutputStringStart, &status);
2105                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2106                         utext_close(&remainingText);
2107                     }
2108                 } else {
2109                     UErrorCode lengthStatus = U_ZERO_ERROR;
2110                     int32_t remaining16Length =
2111                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2112                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2113                     if (remainingChars == NULL) {
2114                         status = U_MEMORY_ALLOCATION_ERROR;
2115                         break;
2116                     }
2117 
2118                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2119                     if (dest[i]) {
2120                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2121                     } else {
2122                         UText remainingText = UTEXT_INITIALIZER;
2123                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2124                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2125                         utext_close(&remainingText);
2126                     }
2127 
2128                     uprv_free(remainingChars);
2129                 }
2130             }
2131             break;
2132         }
2133         if (find()) {
2134             // We found another delimiter.  Move everything from where we started looking
2135             //  up until the start of the delimiter into the next output string.
2136             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2137                 if (dest[i]) {
2138                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2139                                   input->chunkContents+nextOutputStringStart,
2140                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2141                 } else {
2142                     UText remainingText = UTEXT_INITIALIZER;
2143                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2144                                       fMatchStart-nextOutputStringStart, &status);
2145                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2146                     utext_close(&remainingText);
2147                 }
2148             } else {
2149                 UErrorCode lengthStatus = U_ZERO_ERROR;
2150                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2151                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2152                 if (remainingChars == NULL) {
2153                     status = U_MEMORY_ALLOCATION_ERROR;
2154                     break;
2155                 }
2156                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2157                 if (dest[i]) {
2158                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2159                 } else {
2160                     UText remainingText = UTEXT_INITIALIZER;
2161                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2162                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2163                     utext_close(&remainingText);
2164                 }
2165 
2166                 uprv_free(remainingChars);
2167             }
2168             nextOutputStringStart = fMatchEnd;
2169 
2170             // If the delimiter pattern has capturing parentheses, the captured
2171             //  text goes out into the next n destination strings.
2172             int32_t groupNum;
2173             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2174                 if (i >= destCapacity-2) {
2175                     // Never fill the last available output string with capture group text.
2176                     // It will filled with the last field, the remainder of the
2177                     //  unsplit input text.
2178                     break;
2179                 }
2180                 i++;
2181                 dest[i] = group(groupNum, dest[i], status);
2182             }
2183 
2184             if (nextOutputStringStart == fActiveLimit) {
2185                 // The delimiter was at the end of the string.  We're done, but first
2186                 // we output one last empty string, for the empty field following
2187                 //   the delimiter at the end of input.
2188                 if (i+1 < destCapacity) {
2189                     ++i;
2190                     if (dest[i] == NULL) {
2191                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2192                     } else {
2193                         static UChar emptyString[] = {(UChar)0};
2194                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2195                     }
2196                 }
2197                 break;
2198 
2199             }
2200         }
2201         else
2202         {
2203             // We ran off the end of the input while looking for the next delimiter.
2204             // All the remaining text goes into the current output string.
2205             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2206                 if (dest[i]) {
2207                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2208                                   input->chunkContents+nextOutputStringStart,
2209                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2210                 } else {
2211                     UText remainingText = UTEXT_INITIALIZER;
2212                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2213                                      fActiveLimit-nextOutputStringStart, &status);
2214                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2215                     utext_close(&remainingText);
2216                 }
2217             } else {
2218                 UErrorCode lengthStatus = U_ZERO_ERROR;
2219                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2220                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2221                 if (remainingChars == NULL) {
2222                     status = U_MEMORY_ALLOCATION_ERROR;
2223                     break;
2224                 }
2225 
2226                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2227                 if (dest[i]) {
2228                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2229                 } else {
2230                     UText remainingText = UTEXT_INITIALIZER;
2231                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2232                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2233                     utext_close(&remainingText);
2234                 }
2235 
2236                 uprv_free(remainingChars);
2237             }
2238             break;
2239         }
2240         if (U_FAILURE(status)) {
2241             break;
2242         }
2243     }   // end of for loop
2244     return i+1;
2245 }
2246 
2247 
2248 //--------------------------------------------------------------------------------
2249 //
2250 //     start
2251 //
2252 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2253 int32_t RegexMatcher::start(UErrorCode &status) const {
2254     return start(0, status);
2255 }
2256 
start64(UErrorCode & status) const2257 int64_t RegexMatcher::start64(UErrorCode &status) const {
2258     return start64(0, status);
2259 }
2260 
2261 //--------------------------------------------------------------------------------
2262 //
2263 //     start(int32_t group, UErrorCode &status)
2264 //
2265 //--------------------------------------------------------------------------------
2266 
start64(int32_t group,UErrorCode & status) const2267 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2268     if (U_FAILURE(status)) {
2269         return -1;
2270     }
2271     if (U_FAILURE(fDeferredStatus)) {
2272         status = fDeferredStatus;
2273         return -1;
2274     }
2275     if (fMatch == FALSE) {
2276         status = U_REGEX_INVALID_STATE;
2277         return -1;
2278     }
2279     if (group < 0 || group > fPattern->fGroupMap->size()) {
2280         status = U_INDEX_OUTOFBOUNDS_ERROR;
2281         return -1;
2282     }
2283     int64_t s;
2284     if (group == 0) {
2285         s = fMatchStart;
2286     } else {
2287         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2288         U_ASSERT(groupOffset < fPattern->fFrameSize);
2289         U_ASSERT(groupOffset >= 0);
2290         s = fFrame->fExtra[groupOffset];
2291     }
2292 
2293     return s;
2294 }
2295 
2296 
start(int32_t group,UErrorCode & status) const2297 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2298     return (int32_t)start64(group, status);
2299 }
2300 
2301 //--------------------------------------------------------------------------------
2302 //
2303 //     useAnchoringBounds
2304 //
2305 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2306 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2307     fAnchoringBounds = b;
2308     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2309     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2310     return *this;
2311 }
2312 
2313 
2314 //--------------------------------------------------------------------------------
2315 //
2316 //     useTransparentBounds
2317 //
2318 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2319 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2320     fTransparentBounds = b;
2321     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2322     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2323     return *this;
2324 }
2325 
2326 //--------------------------------------------------------------------------------
2327 //
2328 //     setTimeLimit
2329 //
2330 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2331 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2332     if (U_FAILURE(status)) {
2333         return;
2334     }
2335     if (U_FAILURE(fDeferredStatus)) {
2336         status = fDeferredStatus;
2337         return;
2338     }
2339     if (limit < 0) {
2340         status = U_ILLEGAL_ARGUMENT_ERROR;
2341         return;
2342     }
2343     fTimeLimit = limit;
2344 }
2345 
2346 
2347 //--------------------------------------------------------------------------------
2348 //
2349 //     getTimeLimit
2350 //
2351 //--------------------------------------------------------------------------------
getTimeLimit() const2352 int32_t RegexMatcher::getTimeLimit() const {
2353     return fTimeLimit;
2354 }
2355 
2356 
2357 //--------------------------------------------------------------------------------
2358 //
2359 //     setStackLimit
2360 //
2361 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2362 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2363     if (U_FAILURE(status)) {
2364         return;
2365     }
2366     if (U_FAILURE(fDeferredStatus)) {
2367         status = fDeferredStatus;
2368         return;
2369     }
2370     if (limit < 0) {
2371         status = U_ILLEGAL_ARGUMENT_ERROR;
2372         return;
2373     }
2374 
2375     // Reset the matcher.  This is needed here in case there is a current match
2376     //    whose final stack frame (containing the match results, pointed to by fFrame)
2377     //    would be lost by resizing to a smaller stack size.
2378     reset();
2379 
2380     if (limit == 0) {
2381         // Unlimited stack expansion
2382         fStack->setMaxCapacity(0);
2383     } else {
2384         // Change the units of the limit  from bytes to ints, and bump the size up
2385         //   to be big enough to hold at least one stack frame for the pattern,
2386         //   if it isn't there already.
2387         int32_t adjustedLimit = limit / sizeof(int32_t);
2388         if (adjustedLimit < fPattern->fFrameSize) {
2389             adjustedLimit = fPattern->fFrameSize;
2390         }
2391         fStack->setMaxCapacity(adjustedLimit);
2392     }
2393     fStackLimit = limit;
2394 }
2395 
2396 
2397 //--------------------------------------------------------------------------------
2398 //
2399 //     getStackLimit
2400 //
2401 //--------------------------------------------------------------------------------
getStackLimit() const2402 int32_t RegexMatcher::getStackLimit() const {
2403     return fStackLimit;
2404 }
2405 
2406 
2407 //--------------------------------------------------------------------------------
2408 //
2409 //     setMatchCallback
2410 //
2411 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2412 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2413                                     const void              *context,
2414                                     UErrorCode              &status) {
2415     if (U_FAILURE(status)) {
2416         return;
2417     }
2418     fCallbackFn = callback;
2419     fCallbackContext = context;
2420 }
2421 
2422 
2423 //--------------------------------------------------------------------------------
2424 //
2425 //     getMatchCallback
2426 //
2427 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2428 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2429                                   const void              *&context,
2430                                   UErrorCode              &status) {
2431     if (U_FAILURE(status)) {
2432        return;
2433     }
2434     callback = fCallbackFn;
2435     context  = fCallbackContext;
2436 }
2437 
2438 
2439 //--------------------------------------------------------------------------------
2440 //
2441 //     setMatchCallback
2442 //
2443 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2444 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2445                                                 const void                      *context,
2446                                                 UErrorCode                      &status) {
2447     if (U_FAILURE(status)) {
2448         return;
2449     }
2450     fFindProgressCallbackFn = callback;
2451     fFindProgressCallbackContext = context;
2452 }
2453 
2454 
2455 //--------------------------------------------------------------------------------
2456 //
2457 //     getMatchCallback
2458 //
2459 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2460 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2461                                                 const void                    *&context,
2462                                                 UErrorCode                    &status) {
2463     if (U_FAILURE(status)) {
2464        return;
2465     }
2466     callback = fFindProgressCallbackFn;
2467     context  = fFindProgressCallbackContext;
2468 }
2469 
2470 
2471 //================================================================================
2472 //
2473 //    Code following this point in this file is the internal
2474 //    Match Engine Implementation.
2475 //
2476 //================================================================================
2477 
2478 
2479 //--------------------------------------------------------------------------------
2480 //
2481 //   resetStack
2482 //           Discard any previous contents of the state save stack, and initialize a
2483 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2484 //           where they indicate that a group has not yet matched anything.
2485 //--------------------------------------------------------------------------------
resetStack()2486 REStackFrame *RegexMatcher::resetStack() {
2487     // Discard any previous contents of the state save stack, and initialize a
2488     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2489     //  where they indicate that a group has not yet matched anything.
2490     fStack->removeAllElements();
2491 
2492     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2493     int32_t i;
2494     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2495         iFrame->fExtra[i] = -1;
2496     }
2497     return iFrame;
2498 }
2499 
2500 
2501 
2502 //--------------------------------------------------------------------------------
2503 //
2504 //   isWordBoundary
2505 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2506 //                     For us,
2507 //                       If the current char is a combining mark,
2508 //                          \b is FALSE.
2509 //                       Else Scan backwards to the first non-combining char.
2510 //                            We are at a boundary if the this char and the original chars are
2511 //                               opposite in membership in \w set
2512 //
2513 //          parameters:   pos   - the current position in the input buffer
2514 //
2515 //              TODO:  double-check edge cases at region boundaries.
2516 //
2517 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2518 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2519     UBool isBoundary = FALSE;
2520     UBool cIsWord    = FALSE;
2521 
2522     if (pos >= fLookLimit) {
2523         fHitEnd = TRUE;
2524     } else {
2525         // Determine whether char c at current position is a member of the word set of chars.
2526         // If we're off the end of the string, behave as though we're not at a word char.
2527         UTEXT_SETNATIVEINDEX(fInputText, pos);
2528         UChar32  c = UTEXT_CURRENT32(fInputText);
2529         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2530             // Current char is a combining one.  Not a boundary.
2531             return FALSE;
2532         }
2533         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2534     }
2535 
2536     // Back up until we come to a non-combining char, determine whether
2537     //  that char is a word char.
2538     UBool prevCIsWord = FALSE;
2539     for (;;) {
2540         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2541             break;
2542         }
2543         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2544         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2545               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2546             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2547             break;
2548         }
2549     }
2550     isBoundary = cIsWord ^ prevCIsWord;
2551     return isBoundary;
2552 }
2553 
isChunkWordBoundary(int32_t pos)2554 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2555     UBool isBoundary = FALSE;
2556     UBool cIsWord    = FALSE;
2557 
2558     const UChar *inputBuf = fInputText->chunkContents;
2559 
2560     if (pos >= fLookLimit) {
2561         fHitEnd = TRUE;
2562     } else {
2563         // Determine whether char c at current position is a member of the word set of chars.
2564         // If we're off the end of the string, behave as though we're not at a word char.
2565         UChar32 c;
2566         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2567         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2568             // Current char is a combining one.  Not a boundary.
2569             return FALSE;
2570         }
2571         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2572     }
2573 
2574     // Back up until we come to a non-combining char, determine whether
2575     //  that char is a word char.
2576     UBool prevCIsWord = FALSE;
2577     for (;;) {
2578         if (pos <= fLookStart) {
2579             break;
2580         }
2581         UChar32 prevChar;
2582         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2583         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2584               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2585             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2586             break;
2587         }
2588     }
2589     isBoundary = cIsWord ^ prevCIsWord;
2590     return isBoundary;
2591 }
2592 
2593 //--------------------------------------------------------------------------------
2594 //
2595 //   isUWordBoundary
2596 //
2597 //         Test for a word boundary using RBBI word break.
2598 //
2599 //          parameters:   pos   - the current position in the input buffer
2600 //
2601 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2602 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2603     UBool       returnVal = FALSE;
2604 #if UCONFIG_NO_BREAK_ITERATION==0
2605 
2606     // If we haven't yet created a break iterator for this matcher, do it now.
2607     if (fWordBreakItr == NULL) {
2608         fWordBreakItr =
2609             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2610         if (U_FAILURE(fDeferredStatus)) {
2611             return FALSE;
2612         }
2613         fWordBreakItr->setText(fInputText, fDeferredStatus);
2614     }
2615 
2616     if (pos >= fLookLimit) {
2617         fHitEnd = TRUE;
2618         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2619                             //    words are not boundaries.  All non-word chars stand by themselves,
2620                             //    with word boundaries on both sides.
2621     } else {
2622         if (!UTEXT_USES_U16(fInputText)) {
2623             // !!!: Would like a better way to do this!
2624             UErrorCode status = U_ZERO_ERROR;
2625             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2626         }
2627         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2628     }
2629 #endif
2630     return   returnVal;
2631 }
2632 
2633 //--------------------------------------------------------------------------------
2634 //
2635 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2636 //                     saves. Increment the "time" counter, and call the
2637 //                     user callback function if there is one installed.
2638 //
2639 //                     If the match operation needs to be aborted, either for a time-out
2640 //                     or because the user callback asked for it, just set an error status.
2641 //                     The engine will pick that up and stop in its outer loop.
2642 //
2643 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2644 void RegexMatcher::IncrementTime(UErrorCode &status) {
2645     fTickCounter = TIMER_INITIAL_VALUE;
2646     fTime++;
2647     if (fCallbackFn != NULL) {
2648         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2649             status = U_REGEX_STOPPED_BY_CALLER;
2650             return;
2651         }
2652     }
2653     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2654         status = U_REGEX_TIME_OUT;
2655     }
2656 }
2657 
2658 //--------------------------------------------------------------------------------
2659 //
2660 //   ReportFindProgress     This function is called once for each advance in the target
2661 //                          string from the find() function, and calls the user progress callback
2662 //                          function if there is one installed.
2663 //
2664 //                          NOTE:
2665 //
2666 //                          If the match operation needs to be aborted because the user
2667 //                          callback asked for it, just set an error status.
2668 //                          The engine will pick that up and stop in its outer loop.
2669 //
2670 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2671 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2672     if (fFindProgressCallbackFn != NULL) {
2673         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2674             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2675             return FALSE;
2676         }
2677     }
2678     return TRUE;
2679 }
2680 
2681 //--------------------------------------------------------------------------------
2682 //
2683 //   StateSave
2684 //       Make a new stack frame, initialized as a copy of the current stack frame.
2685 //       Set the pattern index in the original stack frame from the operand value
2686 //       in the opcode.  Execution of the engine continues with the state in
2687 //       the newly created stack frame
2688 //
2689 //       Note that reserveBlock() may grow the stack, resulting in the
2690 //       whole thing being relocated in memory.
2691 //
2692 //    Parameters:
2693 //       fp           The top frame pointer when called.  At return, a new
2694 //                    fame will be present
2695 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2696 //                    (not new) frame.  If execution ever back-tracks out of the
2697 //                    new frame, this will be where we continue from in the pattern.
2698 //    Return
2699 //                    The new frame pointer.
2700 //
2701 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2702 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2703     // push storage for a new frame.
2704     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2705     if (newFP == NULL) {
2706         // Failure on attempted stack expansion.
2707         //   Stack function set some other error code, change it to a more
2708         //   specific one for regular expressions.
2709         status = U_REGEX_STACK_OVERFLOW;
2710         // We need to return a writable stack frame, so just return the
2711         //    previous frame.  The match operation will stop quickly
2712         //    because of the error status, after which the frame will never
2713         //    be looked at again.
2714         return fp;
2715     }
2716     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2717 
2718     // New stack frame = copy of old top frame.
2719     int64_t *source = (int64_t *)fp;
2720     int64_t *dest   = newFP;
2721     for (;;) {
2722         *dest++ = *source++;
2723         if (source == newFP) {
2724             break;
2725         }
2726     }
2727 
2728     fTickCounter--;
2729     if (fTickCounter <= 0) {
2730        IncrementTime(status);    // Re-initializes fTickCounter
2731     }
2732     fp->fPatIdx = savePatIdx;
2733     return (REStackFrame *)newFP;
2734 }
2735 
2736 
2737 //--------------------------------------------------------------------------------
2738 //
2739 //   MatchAt      This is the actual matching engine.
2740 //
2741 //                  startIdx:    begin matching a this index.
2742 //                  toEnd:       if true, match must extend to end of the input region
2743 //
2744 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2745 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2746     UBool       isMatch  = FALSE;      // True if the we have a match.
2747 
2748     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2749 
2750     int32_t     op;                    // Operation from the compiled pattern, split into
2751     int32_t     opType;                //    the opcode
2752     int32_t     opValue;               //    and the operand value.
2753 
2754     #ifdef REGEX_RUN_DEBUG
2755     if (fTraceDebug)
2756     {
2757         printf("MatchAt(startIdx=%ld)\n", startIdx);
2758         printf("Original Pattern: ");
2759         UChar32 c = utext_next32From(fPattern->fPattern, 0);
2760         while (c != U_SENTINEL) {
2761             if (c<32 || c>256) {
2762                 c = '.';
2763             }
2764             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2765 
2766             c = UTEXT_NEXT32(fPattern->fPattern);
2767         }
2768         printf("\n");
2769         printf("Input String: ");
2770         c = utext_next32From(fInputText, 0);
2771         while (c != U_SENTINEL) {
2772             if (c<32 || c>256) {
2773                 c = '.';
2774             }
2775             printf("%c", c);
2776 
2777             c = UTEXT_NEXT32(fInputText);
2778         }
2779         printf("\n");
2780         printf("\n");
2781     }
2782     #endif
2783 
2784     if (U_FAILURE(status)) {
2785         return;
2786     }
2787 
2788     //  Cache frequently referenced items from the compiled pattern
2789     //
2790     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2791 
2792     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2793     UVector             *sets          = fPattern->fSets;
2794 
2795     fFrameSize = fPattern->fFrameSize;
2796     REStackFrame        *fp            = resetStack();
2797 
2798     fp->fPatIdx   = 0;
2799     fp->fInputIdx = startIdx;
2800 
2801     // Zero out the pattern's static data
2802     int32_t i;
2803     for (i = 0; i<fPattern->fDataSize; i++) {
2804         fData[i] = 0;
2805     }
2806 
2807     //
2808     //  Main loop for interpreting the compiled pattern.
2809     //  One iteration of the loop per pattern operation performed.
2810     //
2811     for (;;) {
2812 #if 0
2813         if (_heapchk() != _HEAPOK) {
2814             fprintf(stderr, "Heap Trouble\n");
2815         }
2816 #endif
2817 
2818         op      = (int32_t)pat[fp->fPatIdx];
2819         opType  = URX_TYPE(op);
2820         opValue = URX_VAL(op);
2821         #ifdef REGEX_RUN_DEBUG
2822         if (fTraceDebug) {
2823             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2824             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2825                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2826             fPattern->dumpOp(fp->fPatIdx);
2827         }
2828         #endif
2829         fp->fPatIdx++;
2830 
2831         switch (opType) {
2832 
2833 
2834         case URX_NOP:
2835             break;
2836 
2837 
2838         case URX_BACKTRACK:
2839             // Force a backtrack.  In some circumstances, the pattern compiler
2840             //   will notice that the pattern can't possibly match anything, and will
2841             //   emit one of these at that point.
2842             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2843             break;
2844 
2845 
2846         case URX_ONECHAR:
2847             if (fp->fInputIdx < fActiveLimit) {
2848                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2849                 UChar32 c = UTEXT_NEXT32(fInputText);
2850                 if (c == opValue) {
2851                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2852                     break;
2853                 }
2854             } else {
2855                 fHitEnd = TRUE;
2856             }
2857 
2858             #ifdef REGEX_SMART_BACKTRACKING
2859             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2860                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2861                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2862                     UBool success = FALSE;
2863                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
2864                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2865                         if (c == opValue) {
2866                             success = TRUE;
2867                             break;
2868                         } else if (c == U_SENTINEL) {
2869                             break;
2870                         }
2871                         c = UTEXT_PREVIOUS32(fInputText);
2872                     }
2873                     if (success) {
2874                         fHitEnd = FALSE;
2875                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2876                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877                         if (fp->fInputIdx > backSearchIndex) {
2878                             fp = StateSave(fp, fp->fPatIdx, status);
2879                         }
2880                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
2881                         break;
2882                     }
2883                 }
2884             }
2885             #endif
2886 
2887             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2888             break;
2889 
2890 
2891         case URX_STRING:
2892             {
2893                 // Test input against a literal string.
2894                 // Strings require two slots in the compiled pattern, one for the
2895                 //   offset to the string text, and one for the length.
2896                 int32_t   stringStartIdx = opValue;
2897                 int32_t   stringLen;
2898 
2899                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2900                 fp->fPatIdx++;
2901                 opType    = URX_TYPE(op);
2902                 stringLen = URX_VAL(op);
2903                 U_ASSERT(opType == URX_STRING_LEN);
2904                 U_ASSERT(stringLen >= 2);
2905 
2906                 const UChar *patternChars = litText+stringStartIdx;
2907                 const UChar *patternEnd = patternChars+stringLen;
2908 
2909                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2910                 UChar32 c;
2911                 UBool success = TRUE;
2912 
2913                 while (patternChars < patternEnd && success) {
2914                     c = UTEXT_NEXT32(fInputText);
2915 
2916                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2917                         if (U_IS_BMP(c)) {
2918                             success = (*patternChars == c);
2919                             patternChars += 1;
2920                         } else if (patternChars+1 < patternEnd) {
2921                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2922                             patternChars += 2;
2923                         }
2924                     } else {
2925                         success = FALSE;
2926                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
2927                     }
2928                 }
2929 
2930                 if (success) {
2931                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2932                 } else {
2933                     #ifdef REGEX_SMART_BACKTRACKING
2934                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2935                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2936                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2937                             // Reset to last start point
2938                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2939                             patternChars = litText+stringStartIdx;
2940 
2941                             // Search backwards for a possible start
2942                             do {
2943                                 c = UTEXT_PREVIOUS32(fInputText);
2944                                 if (c == U_SENTINEL) {
2945                                     break;
2946                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2947                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2948                                     success = TRUE;
2949                                     break;
2950                                 }
2951                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2952 
2953                             // And try again
2954                             if (success) {
2955                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2957                                 if (fp->fInputIdx > backSearchIndex) {
2958                                     fp = StateSave(fp, fp->fPatIdx, status);
2959                                 }
2960                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2961                                 break;
2962                             }
2963                         }
2964                     }
2965                     #endif
2966                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2967                 }
2968             }
2969             break;
2970 
2971 
2972         case URX_STATE_SAVE:
2973             fp = StateSave(fp, opValue, status);
2974             break;
2975 
2976 
2977         case URX_END:
2978             // The match loop will exit via this path on a successful match,
2979             //   when we reach the end of the pattern.
2980             if (toEnd && fp->fInputIdx != fActiveLimit) {
2981                 // The pattern matched, but not to the end of input.  Try some more.
2982                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2983                 break;
2984             }
2985             isMatch = TRUE;
2986             goto  breakFromLoop;
2987 
2988         // Start and End Capture stack frame variables are laid out out like this:
2989             //  fp->fExtra[opValue]  - The start of a completed capture group
2990             //             opValue+1 - The end   of a completed capture group
2991             //             opValue+2 - the start of a capture group whose end
2992             //                          has not yet been reached (and might not ever be).
2993         case URX_START_CAPTURE:
2994             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2995             fp->fExtra[opValue+2] = fp->fInputIdx;
2996             break;
2997 
2998 
2999         case URX_END_CAPTURE:
3000             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
3001             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
3002             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
3003             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
3004             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3005             break;
3006 
3007 
3008         case URX_DOLLAR:                   //  $, test for End of line
3009                                            //     or for position before new line at end of input
3010             {
3011                 if (fp->fInputIdx >= fAnchorLimit) {
3012                     // We really are at the end of input.  Success.
3013                     fHitEnd = TRUE;
3014                     fRequireEnd = TRUE;
3015                     break;
3016                 }
3017 
3018                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019 
3020                 // If we are positioned just before a new-line that is located at the
3021                 //   end of input, succeed.
3022                 UChar32 c = UTEXT_NEXT32(fInputText);
3023                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3024                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3025                         // If not in the middle of a CR/LF sequence
3026                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3027                             // At new-line at end of input. Success
3028                             fHitEnd = TRUE;
3029                             fRequireEnd = TRUE;
3030 
3031                             break;
3032                         }
3033                     }
3034                 } else {
3035                     UChar32 nextC = UTEXT_NEXT32(fInputText);
3036                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3037                         fHitEnd = TRUE;
3038                         fRequireEnd = TRUE;
3039                         break;                         // At CR/LF at end of input.  Success
3040                     }
3041                 }
3042 
3043                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044             }
3045             break;
3046 
3047 
3048          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3049             if (fp->fInputIdx >= fAnchorLimit) {
3050                 // Off the end of input.  Success.
3051                 fHitEnd = TRUE;
3052                 fRequireEnd = TRUE;
3053                 break;
3054             } else {
3055                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3056                 UChar32 c = UTEXT_NEXT32(fInputText);
3057                 // Either at the last character of input, or off the end.
3058                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3059                     fHitEnd = TRUE;
3060                     fRequireEnd = TRUE;
3061                     break;
3062                 }
3063             }
3064 
3065             // Not at end of input.  Back-track out.
3066             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3067             break;
3068 
3069 
3070          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3071              {
3072                  if (fp->fInputIdx >= fAnchorLimit) {
3073                      // We really are at the end of input.  Success.
3074                      fHitEnd = TRUE;
3075                      fRequireEnd = TRUE;
3076                      break;
3077                  }
3078                  // If we are positioned just before a new-line, succeed.
3079                  // It makes no difference where the new-line is within the input.
3080                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3081                  UChar32 c = UTEXT_CURRENT32(fInputText);
3082                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3083                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3084                      //  In multi-line mode, hitting a new-line just before the end of input does not
3085                      //   set the hitEnd or requireEnd flags
3086                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3087                         break;
3088                      }
3089                  }
3090                  // not at a new line.  Fail.
3091                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3092              }
3093              break;
3094 
3095 
3096          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3097              {
3098                  if (fp->fInputIdx >= fAnchorLimit) {
3099                      // We really are at the end of input.  Success.
3100                      fHitEnd = TRUE;
3101                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3102                      break;               //   adding a new-line would not lose the match.
3103                  }
3104                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3105                  // It makes no difference where the new-line is within the input.
3106                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3107                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3108                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3109                  }
3110              }
3111              break;
3112 
3113 
3114        case URX_CARET:                    //  ^, test for start of line
3115             if (fp->fInputIdx != fAnchorStart) {
3116                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3117             }
3118             break;
3119 
3120 
3121        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3122            {
3123                if (fp->fInputIdx == fAnchorStart) {
3124                    // We are at the start input.  Success.
3125                    break;
3126                }
3127                // Check whether character just before the current pos is a new-line
3128                //   unless we are at the end of input
3129                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3131                if ((fp->fInputIdx < fAnchorLimit) &&
3132                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3133                    //  It's a new-line.  ^ is true.  Success.
3134                    //  TODO:  what should be done with positions between a CR and LF?
3135                    break;
3136                }
3137                // Not at the start of a line.  Fail.
3138                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139            }
3140            break;
3141 
3142 
3143        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3144            {
3145                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3146                if (fp->fInputIdx <= fAnchorStart) {
3147                    // We are at the start input.  Success.
3148                    break;
3149                }
3150                // Check whether character just before the current pos is a new-line
3151                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3152                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3154                if (c != 0x0a) {
3155                    // Not at the start of a line.  Back-track out.
3156                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3157                }
3158            }
3159            break;
3160 
3161         case URX_BACKSLASH_B:          // Test for word boundaries
3162             {
3163                 UBool success = isWordBoundary(fp->fInputIdx);
3164                 success ^= (opValue != 0);     // flip sense for \B
3165                 if (!success) {
3166                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3167                 }
3168             }
3169             break;
3170 
3171 
3172         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3173             {
3174                 UBool success = isUWordBoundary(fp->fInputIdx);
3175                 success ^= (opValue != 0);     // flip sense for \B
3176                 if (!success) {
3177                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3178                 }
3179             }
3180             break;
3181 
3182 
3183         case URX_BACKSLASH_D:            // Test for decimal digit
3184             {
3185                 if (fp->fInputIdx >= fActiveLimit) {
3186                     fHitEnd = TRUE;
3187                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188                     break;
3189                 }
3190 
3191                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3192 
3193                 UChar32 c = UTEXT_NEXT32(fInputText);
3194                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3195                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3196                 success ^= (opValue != 0);        // flip sense for \D
3197                 if (success) {
3198                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199                 } else {
3200                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201                 }
3202             }
3203             break;
3204 
3205 
3206         case URX_BACKSLASH_G:          // Test for position at end of previous match
3207             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3208                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3209             }
3210             break;
3211 
3212 
3213         case URX_BACKSLASH_X:
3214             //  Match a Grapheme, as defined by Unicode TR 29.
3215             //  Differs slightly from Perl, which consumes combining marks independently
3216             //    of context.
3217             {
3218 
3219                 // Fail if at end of input
3220                 if (fp->fInputIdx >= fActiveLimit) {
3221                     fHitEnd = TRUE;
3222                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3223                     break;
3224                 }
3225 
3226                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3227 
3228                 // Examine (and consume) the current char.
3229                 //   Dispatch into a little state machine, based on the char.
3230                 UChar32  c;
3231                 c = UTEXT_NEXT32(fInputText);
3232                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233                 UnicodeSet **sets = fPattern->fStaticSets;
3234                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3235                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3236                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3237                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3238                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3239                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3240                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3241                 goto GC_Extend;
3242 
3243 
3244 
3245 GC_L:
3246                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3247                 c = UTEXT_NEXT32(fInputText);
3248                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3250                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3251                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3252                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3253                 (void)UTEXT_PREVIOUS32(fInputText);
3254                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255                 goto GC_Extend;
3256 
3257 GC_V:
3258                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3259                 c = UTEXT_NEXT32(fInputText);
3260                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3262                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3263                 (void)UTEXT_PREVIOUS32(fInputText);
3264                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3265                 goto GC_Extend;
3266 
3267 GC_T:
3268                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3269                 c = UTEXT_NEXT32(fInputText);
3270                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3271                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3272                 (void)UTEXT_PREVIOUS32(fInputText);
3273                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274                 goto GC_Extend;
3275 
3276 GC_Extend:
3277                 // Combining characters are consumed here
3278                 for (;;) {
3279                     if (fp->fInputIdx >= fActiveLimit) {
3280                         break;
3281                     }
3282                     c = UTEXT_CURRENT32(fInputText);
3283                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3284                         break;
3285                     }
3286                     (void)UTEXT_NEXT32(fInputText);
3287                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3288                 }
3289                 goto GC_Done;
3290 
3291 GC_Control:
3292                 // Most control chars stand alone (don't combine with combining chars),
3293                 //   except for that CR/LF sequence is a single grapheme cluster.
3294                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3295                     c = UTEXT_NEXT32(fInputText);
3296                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3297                 }
3298 
3299 GC_Done:
3300                 if (fp->fInputIdx >= fActiveLimit) {
3301                     fHitEnd = TRUE;
3302                 }
3303                 break;
3304             }
3305 
3306 
3307 
3308 
3309         case URX_BACKSLASH_Z:          // Test for end of Input
3310             if (fp->fInputIdx < fAnchorLimit) {
3311                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3312             } else {
3313                 fHitEnd = TRUE;
3314                 fRequireEnd = TRUE;
3315             }
3316             break;
3317 
3318 
3319 
3320         case URX_STATIC_SETREF:
3321             {
3322                 // Test input character against one of the predefined sets
3323                 //    (Word Characters, for example)
3324                 // The high bit of the op value is a flag for the match polarity.
3325                 //    0:   success if input char is in set.
3326                 //    1:   success if input char is not in set.
3327                 if (fp->fInputIdx >= fActiveLimit) {
3328                     fHitEnd = TRUE;
3329                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330                     break;
3331                 }
3332 
3333                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3334                 opValue &= ~URX_NEG_SET;
3335                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3336 
3337                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3338                 UChar32 c = UTEXT_NEXT32(fInputText);
3339                 if (c < 256) {
3340                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3341                     if (s8->contains(c)) {
3342                         success = !success;
3343                     }
3344                 } else {
3345                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3346                     if (s->contains(c)) {
3347                         success = !success;
3348                     }
3349                 }
3350                 if (success) {
3351                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352                 } else {
3353                     // the character wasn't in the set.
3354                     #ifdef REGEX_SMART_BACKTRACKING
3355                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3356                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3357                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3358                             // Try to find it, backwards
3359                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3360                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3361                             do {
3362                                 c = UTEXT_PREVIOUS32(fInputText);
3363                                 if (c == U_SENTINEL) {
3364                                     break;
3365                                 } else if (c < 256) {
3366                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3367                                     if (s8->contains(c)) {
3368                                         success = !success;
3369                                     }
3370                                 } else {
3371                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3372                                     if (s->contains(c)) {
3373                                         success = !success;
3374                                     }
3375                                 }
3376                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3377 
3378                             if (success && c != U_SENTINEL) {
3379                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3380                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381                                 if (fp->fInputIdx > backSearchIndex) {
3382                                     fp = StateSave(fp, fp->fPatIdx, status);
3383                                 }
3384                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3385                                 break;
3386                             }
3387                         }
3388                     }
3389                     #endif
3390                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3391                 }
3392             }
3393             break;
3394 
3395 
3396         case URX_STAT_SETREF_N:
3397             {
3398                 // Test input character for NOT being a member of  one of
3399                 //    the predefined sets (Word Characters, for example)
3400                 if (fp->fInputIdx >= fActiveLimit) {
3401                     fHitEnd = TRUE;
3402                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403                     break;
3404                 }
3405 
3406                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3407 
3408                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3409 
3410                 UChar32 c = UTEXT_NEXT32(fInputText);
3411                 if (c < 256) {
3412                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3413                     if (s8->contains(c) == FALSE) {
3414                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3415                         break;
3416                     }
3417                 } else {
3418                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3419                     if (s->contains(c) == FALSE) {
3420                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421                         break;
3422                     }
3423                 }
3424                 // the character wasn't in the set.
3425                 #ifdef REGEX_SMART_BACKTRACKING
3426                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3427                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3428                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3429                         // Try to find it, backwards
3430                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3431                         UBool success = FALSE;
3432                         do {
3433                             c = UTEXT_PREVIOUS32(fInputText);
3434                             if (c == U_SENTINEL) {
3435                                 break;
3436                             } else if (c < 256) {
3437                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3438                                 if (s8->contains(c) == FALSE) {
3439                                     success = TRUE;
3440                                     break;
3441                                 }
3442                             } else {
3443                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3444                                 if (s->contains(c) == FALSE) {
3445                                     success = TRUE;
3446                                     break;
3447                                 }
3448                             }
3449                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3450 
3451                         if (success) {
3452                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3453                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454                             if (fp->fInputIdx > backSearchIndex) {
3455                                 fp = StateSave(fp, fp->fPatIdx, status);
3456                             }
3457                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3458                             break;
3459                         }
3460                     }
3461                 }
3462                 #endif
3463                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3464             }
3465             break;
3466 
3467 
3468         case URX_SETREF:
3469             if (fp->fInputIdx >= fActiveLimit) {
3470                 fHitEnd = TRUE;
3471                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472                 break;
3473             } else {
3474                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3475 
3476                 // There is input left.  Pick up one char and test it for set membership.
3477                 UChar32 c = UTEXT_NEXT32(fInputText);
3478                 U_ASSERT(opValue > 0 && opValue < sets->size());
3479                 if (c<256) {
3480                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3481                     if (s8->contains(c)) {
3482                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483                         break;
3484                     }
3485                 } else {
3486                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3487                     if (s->contains(c)) {
3488                         // The character is in the set.  A Match.
3489                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3490                         break;
3491                     }
3492                 }
3493 
3494                 // the character wasn't in the set.
3495                 #ifdef REGEX_SMART_BACKTRACKING
3496                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3497                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3498                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3499                         // Try to find it, backwards
3500                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3501                         UBool success = FALSE;
3502                         do {
3503                             c = UTEXT_PREVIOUS32(fInputText);
3504                             if (c == U_SENTINEL) {
3505                                 break;
3506                             } else if (c < 256) {
3507                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3508                                 if (s8->contains(c)) {
3509                                     success = TRUE;
3510                                     break;
3511                                 }
3512                             } else {
3513                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3514                                 if (s->contains(c)) {
3515                                     success = TRUE;
3516                                     break;
3517                                 }
3518                             }
3519                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3520 
3521                         if (success) {
3522                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3523                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3524                             if (fp->fInputIdx > backSearchIndex) {
3525                                 fp = StateSave(fp, fp->fPatIdx, status);
3526                             }
3527                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3528                             break;
3529                         }
3530                     }
3531                 }
3532                 #endif
3533                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3534             }
3535             break;
3536 
3537 
3538         case URX_DOTANY:
3539             {
3540                 // . matches anything, but stops at end-of-line.
3541                 if (fp->fInputIdx >= fActiveLimit) {
3542                     // At end of input.  Match failed.  Backtrack out.
3543                     fHitEnd = TRUE;
3544                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3545                     break;
3546                 }
3547 
3548                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3549 
3550                 // There is input left.  Advance over one char, unless we've hit end-of-line
3551                 UChar32 c = UTEXT_NEXT32(fInputText);
3552                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3553                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3554                     // End of line in normal mode.   . does not match.
3555                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3556                     break;
3557                 }
3558                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3559             }
3560             break;
3561 
3562 
3563         case URX_DOTANY_ALL:
3564             {
3565                 // ., in dot-matches-all (including new lines) mode
3566                 if (fp->fInputIdx >= fActiveLimit) {
3567                     // At end of input.  Match failed.  Backtrack out.
3568                     fHitEnd = TRUE;
3569                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3570                     break;
3571                 }
3572 
3573                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3574 
3575                 // There is input left.  Advance over one char, except if we are
3576                 //   at a cr/lf, advance over both of them.
3577                 UChar32 c;
3578                 c = UTEXT_NEXT32(fInputText);
3579                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3580                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3581                     // In the case of a CR/LF, we need to advance over both.
3582                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3583                     if (nextc == 0x0a) {
3584                         (void)UTEXT_NEXT32(fInputText);
3585                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3586                     }
3587                 }
3588             }
3589             break;
3590 
3591 
3592         case URX_DOTANY_UNIX:
3593             {
3594                 // '.' operator, matches all, but stops at end-of-line.
3595                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3596                 if (fp->fInputIdx >= fActiveLimit) {
3597                     // At end of input.  Match failed.  Backtrack out.
3598                     fHitEnd = TRUE;
3599                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3600                     break;
3601                 }
3602 
3603                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3604 
3605                 // There is input left.  Advance over one char, unless we've hit end-of-line
3606                 UChar32 c = UTEXT_NEXT32(fInputText);
3607                 if (c == 0x0a) {
3608                     // End of line in normal mode.   '.' does not match the \n
3609                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3610                 } else {
3611                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3612                 }
3613             }
3614             break;
3615 
3616 
3617         case URX_JMP:
3618             fp->fPatIdx = opValue;
3619             break;
3620 
3621         case URX_FAIL:
3622             isMatch = FALSE;
3623             goto breakFromLoop;
3624 
3625         case URX_JMP_SAV:
3626             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3627             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3628             fp->fPatIdx = opValue;                         // Then JMP.
3629             break;
3630 
3631         case URX_JMP_SAV_X:
3632             // This opcode is used with (x)+, when x can match a zero length string.
3633             // Same as JMP_SAV, except conditional on the match having made forward progress.
3634             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3635             //   data address of the input position at the start of the loop.
3636             {
3637                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3638                 int32_t  stoOp = (int32_t)pat[opValue-1];
3639                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3640                 int32_t  frameLoc = URX_VAL(stoOp);
3641                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3642                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3643                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3644                 if (prevInputIdx < fp->fInputIdx) {
3645                     // The match did make progress.  Repeat the loop.
3646                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3647                     fp->fPatIdx = opValue;
3648                     fp->fExtra[frameLoc] = fp->fInputIdx;
3649                 }
3650                 // If the input position did not advance, we do nothing here,
3651                 //   execution will fall out of the loop.
3652             }
3653             break;
3654 
3655         case URX_CTR_INIT:
3656             {
3657                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3658                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3659 
3660                 // Pick up the three extra operands that CTR_INIT has, and
3661                 //    skip the pattern location counter past
3662                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3663                 fp->fPatIdx += 3;
3664                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3665                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3666                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3667                 U_ASSERT(minCount>=0);
3668                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3669                 U_ASSERT(loopLoc>fp->fPatIdx);
3670 
3671                 if (minCount == 0) {
3672                     fp = StateSave(fp, loopLoc+1, status);
3673                 }
3674                 if (maxCount == 0) {
3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3676                 }
3677             }
3678             break;
3679 
3680         case URX_CTR_LOOP:
3681             {
3682                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3683                 int32_t initOp = (int32_t)pat[opValue];
3684                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3685                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3686                 int32_t minCount  = (int32_t)pat[opValue+2];
3687                 int32_t maxCount  = (int32_t)pat[opValue+3];
3688                 // Increment the counter.  Note: we DIDN'T worry about counter
3689                 //   overflow, since the data comes from UnicodeStrings, which
3690                 //   stores its length in an int32_t. Do we have to think about
3691                 //   this now that we're using UText? Probably not, since the length
3692                 //   in UChar32s is still an int32_t.
3693                 (*pCounter)++;
3694                 U_ASSERT(*pCounter > 0);
3695                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3696                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3697                     break;
3698                 }
3699                 if (*pCounter >= minCount) {
3700                     fp = StateSave(fp, fp->fPatIdx, status);
3701                 }
3702                 fp->fPatIdx = opValue + 4;    // Loop back.
3703             }
3704             break;
3705 
3706         case URX_CTR_INIT_NG:
3707             {
3708                 // Initialize a non-greedy loop
3709                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3710                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3711 
3712                 // Pick up the three extra operands that CTR_INIT has, and
3713                 //    skip the pattern location counter past
3714                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3715                 fp->fPatIdx += 3;
3716                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3717                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3718                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3719                 U_ASSERT(minCount>=0);
3720                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3721                 U_ASSERT(loopLoc>fp->fPatIdx);
3722 
3723                 if (minCount == 0) {
3724                     if (maxCount != 0) {
3725                         fp = StateSave(fp, fp->fPatIdx, status);
3726                     }
3727                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3728                 }
3729             }
3730             break;
3731 
3732         case URX_CTR_LOOP_NG:
3733             {
3734                 // Non-greedy {min, max} loops
3735                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3736                 int32_t initOp = (int32_t)pat[opValue];
3737                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3738                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3739                 int32_t minCount  = (int32_t)pat[opValue+2];
3740                 int32_t maxCount  = (int32_t)pat[opValue+3];
3741                 // Increment the counter.  Note: we DIDN'T worry about counter
3742                 //   overflow, since the data comes from UnicodeStrings, which
3743                 //   stores its length in an int32_t. Do we have to think about
3744                 //   this now that we're using UText? Probably not, since the length
3745                 //   in UChar32s is still an int32_t.
3746                 (*pCounter)++;
3747                 U_ASSERT(*pCounter > 0);
3748 
3749                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3750                     // The loop has matched the maximum permitted number of times.
3751                     //   Break out of here with no action.  Matching will
3752                     //   continue with the following pattern.
3753                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3754                     break;
3755                 }
3756 
3757                 if (*pCounter < minCount) {
3758                     // We haven't met the minimum number of matches yet.
3759                     //   Loop back for another one.
3760                     fp->fPatIdx = opValue + 4;    // Loop back.
3761                 } else {
3762                     // We do have the minimum number of matches.
3763                     //   Fall into the following pattern, but first do
3764                     //   a state save to the top of the loop, so that a failure
3765                     //   in the following pattern will try another iteration of the loop.
3766                     fp = StateSave(fp, opValue + 4, status);
3767                 }
3768             }
3769             break;
3770 
3771         case URX_STO_SP:
3772             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3773             fData[opValue] = fStack->size();
3774             break;
3775 
3776         case URX_LD_SP:
3777             {
3778                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3779                 int32_t newStackSize = (int32_t)fData[opValue];
3780                 U_ASSERT(newStackSize <= fStack->size());
3781                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3782                 if (newFP == (int64_t *)fp) {
3783                     break;
3784                 }
3785                 int32_t i;
3786                 for (i=0; i<fFrameSize; i++) {
3787                     newFP[i] = ((int64_t *)fp)[i];
3788                 }
3789                 fp = (REStackFrame *)newFP;
3790                 fStack->setSize(newStackSize);
3791             }
3792             break;
3793 
3794         case URX_BACKREF:
3795         case URX_BACKREF_I:
3796             {
3797                 U_ASSERT(opValue < fFrameSize);
3798                 int64_t groupStartIdx = fp->fExtra[opValue];
3799                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3800                 U_ASSERT(groupStartIdx <= groupEndIdx);
3801                 if (groupStartIdx < 0) {
3802                     // This capture group has not participated in the match thus far,
3803                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3804                 }
3805 
3806                 if (groupEndIdx == groupStartIdx) {
3807                     //   The capture group match was of an empty string.
3808                     //   Verified by testing:  Perl matches succeed in this case, so
3809                     //   we do too.
3810                     break;
3811                 }
3812 
3813                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3814                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3815 
3816                 UBool haveMatch = (opType == URX_BACKREF ?
3817                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3818                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3819                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820 
3821                 if (fp->fInputIdx > fActiveLimit) {
3822                     fHitEnd = TRUE;
3823                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3824                 } else if (!haveMatch) {
3825                     if (fp->fInputIdx == fActiveLimit) {
3826                         fHitEnd = TRUE;
3827                     }
3828                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3829                 }
3830             }
3831             break;
3832 
3833         case URX_STO_INP_LOC:
3834             {
3835                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3836                 fp->fExtra[opValue] = fp->fInputIdx;
3837             }
3838             break;
3839 
3840         case URX_JMPX:
3841             {
3842                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3843                 fp->fPatIdx += 1;
3844                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3845                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3846                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3847                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3848                 if (savedInputIdx < fp->fInputIdx) {
3849                     fp->fPatIdx = opValue;                               // JMP
3850                 } else {
3851                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3852                 }
3853             }
3854             break;
3855 
3856         case URX_LA_START:
3857             {
3858                 // Entering a lookahead block.
3859                 // Save Stack Ptr, Input Pos.
3860                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861                 fData[opValue]   = fStack->size();
3862                 fData[opValue+1] = fp->fInputIdx;
3863                 fActiveStart     = fLookStart;          // Set the match region change for
3864                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3865             }
3866             break;
3867 
3868         case URX_LA_END:
3869             {
3870                 // Leaving a look-ahead block.
3871                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3872                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3873                 int32_t stackSize = fStack->size();
3874                 int32_t newStackSize =(int32_t)fData[opValue];
3875                 U_ASSERT(stackSize >= newStackSize);
3876                 if (stackSize > newStackSize) {
3877                     // Copy the current top frame back to the new (cut back) top frame.
3878                     //   This makes the capture groups from within the look-ahead
3879                     //   expression available.
3880                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3881                     int32_t i;
3882                     for (i=0; i<fFrameSize; i++) {
3883                         newFP[i] = ((int64_t *)fp)[i];
3884                     }
3885                     fp = (REStackFrame *)newFP;
3886                     fStack->setSize(newStackSize);
3887                 }
3888                 fp->fInputIdx = fData[opValue+1];
3889 
3890                 // Restore the active region bounds in the input string; they may have
3891                 //    been changed because of transparent bounds on a Region.
3892                 fActiveStart = fRegionStart;
3893                 fActiveLimit = fRegionLimit;
3894             }
3895             break;
3896 
3897         case URX_ONECHAR_I:
3898             if (fp->fInputIdx < fActiveLimit) {
3899                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3900 
3901                 UChar32 c = UTEXT_NEXT32(fInputText);
3902                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3903                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3904                     break;
3905                 }
3906             } else {
3907                 fHitEnd = TRUE;
3908             }
3909 
3910             #ifdef REGEX_SMART_BACKTRACKING
3911             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3912                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3913                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3914                     UBool success = FALSE;
3915                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
3916                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3917                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3918                             success = TRUE;
3919                             break;
3920                         } else if (c == U_SENTINEL) {
3921                             break;
3922                         }
3923                         c = UTEXT_PREVIOUS32(fInputText);
3924                     }
3925                     if (success) {
3926                         fHitEnd = FALSE;
3927                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3928                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929                         if (fp->fInputIdx > backSearchIndex) {
3930                             fp = StateSave(fp, fp->fPatIdx, status);
3931                         }
3932                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
3933                         break;
3934                     }
3935                 }
3936             }
3937             #endif
3938 
3939             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3940             break;
3941 
3942         case URX_STRING_I:
3943             {
3944                 // Test input against a literal string.
3945                 // Strings require two slots in the compiled pattern, one for the
3946                 //   offset to the string text, and one for the length.
3947                 const UCaseProps *csp = ucase_getSingleton();
3948                 {
3949                     int32_t stringStartIdx, stringLen;
3950                     stringStartIdx = opValue;
3951 
3952                     op      = (int32_t)pat[fp->fPatIdx];
3953                     fp->fPatIdx++;
3954                     opType  = URX_TYPE(op);
3955                     opValue = URX_VAL(op);
3956                     U_ASSERT(opType == URX_STRING_LEN);
3957                     stringLen = opValue;
3958 
3959                     const UChar *patternChars = litText+stringStartIdx;
3960                     const UChar *patternEnd = patternChars+stringLen;
3961 
3962                     const UChar *foldChars = NULL;
3963                     int32_t foldOffset, foldLength;
3964                     UChar32 c;
3965 
3966                     foldOffset = foldLength = 0;
3967                     UBool success = TRUE;
3968 
3969                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3970                     while (patternChars < patternEnd && success) {
3971                         if(foldOffset < foldLength) {
3972                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3973                         } else {
3974                             c = UTEXT_NEXT32(fInputText);
3975                             if (c != U_SENTINEL) {
3976                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3977                                 if(foldLength >= 0) {
3978                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3979                                         foldOffset = 0;
3980                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3981                                     } else {
3982                                         c = foldLength;
3983                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
3984                                     }
3985                                 }
3986                             }
3987 
3988                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3989                         }
3990 
3991                         success = FALSE;
3992                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3993                             if (U_IS_BMP(c)) {
3994                                 success = (*patternChars == c);
3995                                 patternChars += 1;
3996                             } else if (patternChars+1 < patternEnd) {
3997                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3998                                 patternChars += 2;
3999                             }
4000                         } else {
4001                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
4002                         }
4003                     }
4004 
4005                     if (!success) {
4006                         #ifdef REGEX_SMART_BACKTRACKING
4007                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4008                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4009                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4010                                 // Reset to last start point
4011                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4012                                 patternChars = litText+stringStartIdx;
4013 
4014                                 // Search backwards for a possible start
4015                                 do {
4016                                     c = UTEXT_PREVIOUS32(fInputText);
4017                                     if (c == U_SENTINEL) {
4018                                         break;
4019                                     } else {
4020                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4021                                         if(foldLength >= 0) {
4022                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
4023                                                 foldOffset = 0;
4024                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4025                                             } else {
4026                                                 c = foldLength;
4027                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
4028                                             }
4029                                         }
4030 
4031                                         if ((U_IS_BMP(c) && *patternChars == c) ||
4032                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4033                                             success = TRUE;
4034                                             break;
4035                                         }
4036                                     }
4037                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4038 
4039                                 // And try again
4040                                 if (success) {
4041                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4042                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4043                                     if (fp->fInputIdx > backSearchIndex) {
4044                                         fp = StateSave(fp, fp->fPatIdx, status);
4045                                     }
4046                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
4047                                     break;
4048                                 }
4049                             }
4050                         }
4051                         #endif
4052                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4053                     }
4054                 }
4055             }
4056             break;
4057 
4058         case URX_LB_START:
4059             {
4060                 // Entering a look-behind block.
4061                 // Save Stack Ptr, Input Pos.
4062                 //   TODO:  implement transparent bounds.  Ticket #6067
4063                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4064                 fData[opValue]   = fStack->size();
4065                 fData[opValue+1] = fp->fInputIdx;
4066                 // Init the variable containing the start index for attempted matches.
4067                 fData[opValue+2] = -1;
4068                 // Save input string length, then reset to pin any matches to end at
4069                 //   the current position.
4070                 fData[opValue+3] = fActiveLimit;
4071                 fActiveLimit     = fp->fInputIdx;
4072             }
4073             break;
4074 
4075 
4076         case URX_LB_CONT:
4077             {
4078                 // Positive Look-Behind, at top of loop checking for matches of LB expression
4079                 //    at all possible input starting positions.
4080 
4081                 // Fetch the min and max possible match lengths.  They are the operands
4082                 //   of this op in the pattern.
4083                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4084                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4085                 U_ASSERT(minML <= maxML);
4086                 U_ASSERT(minML >= 0);
4087 
4088                 // Fetch (from data) the last input index where a match was attempted.
4089                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4090                 int64_t  *lbStartIdx = &fData[opValue+2];
4091                 if (*lbStartIdx < 0) {
4092                     // First time through loop.
4093                     *lbStartIdx = fp->fInputIdx - minML;
4094                 } else {
4095                     // 2nd through nth time through the loop.
4096                     // Back up start position for match by one.
4097                     if (*lbStartIdx == 0) {
4098                         (*lbStartIdx)--;
4099                     } else {
4100                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4101                         (void)UTEXT_PREVIOUS32(fInputText);
4102                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4103                     }
4104                 }
4105 
4106                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4107                     // We have tried all potential match starting points without
4108                     //  getting a match.  Backtrack out, and out of the
4109                     //   Look Behind altogether.
4110                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4111                     int64_t restoreInputLen = fData[opValue+3];
4112                     U_ASSERT(restoreInputLen >= fActiveLimit);
4113                     U_ASSERT(restoreInputLen <= fInputLength);
4114                     fActiveLimit = restoreInputLen;
4115                     break;
4116                 }
4117 
4118                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4119                 //      (successful match will fall off the end of the loop.)
4120                 fp = StateSave(fp, fp->fPatIdx-3, status);
4121                 fp->fInputIdx = *lbStartIdx;
4122             }
4123             break;
4124 
4125         case URX_LB_END:
4126             // End of a look-behind block, after a successful match.
4127             {
4128                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4129                 if (fp->fInputIdx != fActiveLimit) {
4130                     //  The look-behind expression matched, but the match did not
4131                     //    extend all the way to the point that we are looking behind from.
4132                     //  FAIL out of here, which will take us back to the LB_CONT, which
4133                     //     will retry the match starting at another position or fail
4134                     //     the look-behind altogether, whichever is appropriate.
4135                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4136                     break;
4137                 }
4138 
4139                 // Look-behind match is good.  Restore the orignal input string length,
4140                 //   which had been truncated to pin the end of the lookbehind match to the
4141                 //   position being looked-behind.
4142                 int64_t originalInputLen = fData[opValue+3];
4143                 U_ASSERT(originalInputLen >= fActiveLimit);
4144                 U_ASSERT(originalInputLen <= fInputLength);
4145                 fActiveLimit = originalInputLen;
4146             }
4147             break;
4148 
4149 
4150         case URX_LBN_CONT:
4151             {
4152                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4153                 //    at all possible input starting positions.
4154 
4155                 // Fetch the extra parameters of this op.
4156                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4157                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4158                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4159                         continueLoc = URX_VAL(continueLoc);
4160                 U_ASSERT(minML <= maxML);
4161                 U_ASSERT(minML >= 0);
4162                 U_ASSERT(continueLoc > fp->fPatIdx);
4163 
4164                 // Fetch (from data) the last input index where a match was attempted.
4165                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4166                 int64_t  *lbStartIdx = &fData[opValue+2];
4167                 if (*lbStartIdx < 0) {
4168                     // First time through loop.
4169                     *lbStartIdx = fp->fInputIdx - minML;
4170                 } else {
4171                     // 2nd through nth time through the loop.
4172                     // Back up start position for match by one.
4173                     if (*lbStartIdx == 0) {
4174                         (*lbStartIdx)--;
4175                     } else {
4176                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4177                         (void)UTEXT_PREVIOUS32(fInputText);
4178                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4179                     }
4180                 }
4181 
4182                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4183                     // We have tried all potential match starting points without
4184                     //  getting a match, which means that the negative lookbehind as
4185                     //  a whole has succeeded.  Jump forward to the continue location
4186                     int64_t restoreInputLen = fData[opValue+3];
4187                     U_ASSERT(restoreInputLen >= fActiveLimit);
4188                     U_ASSERT(restoreInputLen <= fInputLength);
4189                     fActiveLimit = restoreInputLen;
4190                     fp->fPatIdx = continueLoc;
4191                     break;
4192                 }
4193 
4194                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4195                 //      (successful match will cause a FAIL out of the loop altogether.)
4196                 fp = StateSave(fp, fp->fPatIdx-4, status);
4197                 fp->fInputIdx = *lbStartIdx;
4198             }
4199             break;
4200 
4201         case URX_LBN_END:
4202             // End of a negative look-behind block, after a successful match.
4203             {
4204                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4205                 if (fp->fInputIdx != fActiveLimit) {
4206                     //  The look-behind expression matched, but the match did not
4207                     //    extend all the way to the point that we are looking behind from.
4208                     //  FAIL out of here, which will take us back to the LB_CONT, which
4209                     //     will retry the match starting at another position or succeed
4210                     //     the look-behind altogether, whichever is appropriate.
4211                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4212                     break;
4213                 }
4214 
4215                 // Look-behind expression matched, which means look-behind test as
4216                 //   a whole Fails
4217 
4218                 //   Restore the orignal input string length, which had been truncated
4219                 //   inorder to pin the end of the lookbehind match
4220                 //   to the position being looked-behind.
4221                 int64_t originalInputLen = fData[opValue+3];
4222                 U_ASSERT(originalInputLen >= fActiveLimit);
4223                 U_ASSERT(originalInputLen <= fInputLength);
4224                 fActiveLimit = originalInputLen;
4225 
4226                 // Restore original stack position, discarding any state saved
4227                 //   by the successful pattern match.
4228                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4229                 int32_t newStackSize = (int32_t)fData[opValue];
4230                 U_ASSERT(fStack->size() > newStackSize);
4231                 fStack->setSize(newStackSize);
4232 
4233                 //  FAIL, which will take control back to someplace
4234                 //  prior to entering the look-behind test.
4235                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4236             }
4237             break;
4238 
4239 
4240         case URX_LOOP_SR_I:
4241             // Loop Initialization for the optimized implementation of
4242             //     [some character set]*
4243             //   This op scans through all matching input.
4244             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4245             {
4246                 U_ASSERT(opValue > 0 && opValue < sets->size());
4247                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4248                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4249 
4250                 // Loop through input, until either the input is exhausted or
4251                 //   we reach a character that is not a member of the set.
4252                 int64_t ix = fp->fInputIdx;
4253                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4254                 for (;;) {
4255                     if (ix >= fActiveLimit) {
4256                         fHitEnd = TRUE;
4257                         break;
4258                     }
4259                     UChar32 c = UTEXT_NEXT32(fInputText);
4260                     if (c<256) {
4261                         if (s8->contains(c) == FALSE) {
4262                             break;
4263                         }
4264                     } else {
4265                         if (s->contains(c) == FALSE) {
4266                             break;
4267                         }
4268                     }
4269                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4270                 }
4271 
4272                 // If there were no matching characters, skip over the loop altogether.
4273                 //   The loop doesn't run at all, a * op always succeeds.
4274                 if (ix == fp->fInputIdx) {
4275                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4276                     break;
4277                 }
4278 
4279                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4280                 //   must follow.  It's operand is the stack location
4281                 //   that holds the starting input index for the match of this [set]*
4282                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4283                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4284                 int32_t stackLoc = URX_VAL(loopcOp);
4285                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4286                 fp->fExtra[stackLoc] = fp->fInputIdx;
4287                 #ifdef REGEX_SMART_BACKTRACKING
4288                 backSearchIndex = fp->fInputIdx;
4289                 #endif
4290                 fp->fInputIdx = ix;
4291 
4292                 // Save State to the URX_LOOP_C op that follows this one,
4293                 //   so that match failures in the following code will return to there.
4294                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4295                 fp = StateSave(fp, fp->fPatIdx, status);
4296                 fp->fPatIdx++;
4297             }
4298             break;
4299 
4300 
4301         case URX_LOOP_DOT_I:
4302             // Loop Initialization for the optimized implementation of .*
4303             //   This op scans through all remaining input.
4304             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4305             {
4306                 // Loop through input until the input is exhausted (we reach an end-of-line)
4307                 // In DOTALL mode, we can just go straight to the end of the input.
4308                 int64_t ix;
4309                 if ((opValue & 1) == 1) {
4310                     // Dot-matches-All mode.  Jump straight to the end of the string.
4311                     ix = fActiveLimit;
4312                     fHitEnd = TRUE;
4313                 } else {
4314                     // NOT DOT ALL mode.  Line endings do not match '.'
4315                     // Scan forward until a line ending or end of input.
4316                     ix = fp->fInputIdx;
4317                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4318                     for (;;) {
4319                         if (ix >= fActiveLimit) {
4320                             fHitEnd = TRUE;
4321                             break;
4322                         }
4323                         UChar32 c = UTEXT_NEXT32(fInputText);
4324                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4325                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4326                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4327                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4328                                 //  char is a line ending.  Exit the scanning loop.
4329                                 break;
4330                             }
4331                         }
4332                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4333                     }
4334                 }
4335 
4336                 // If there were no matching characters, skip over the loop altogether.
4337                 //   The loop doesn't run at all, a * op always succeeds.
4338                 if (ix == fp->fInputIdx) {
4339                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4340                     break;
4341                 }
4342 
4343                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4344                 //   must follow.  It's operand is the stack location
4345                 //   that holds the starting input index for the match of this .*
4346                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4347                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4348                 int32_t stackLoc = URX_VAL(loopcOp);
4349                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4350                 fp->fExtra[stackLoc] = fp->fInputIdx;
4351                 #ifdef REGEX_SMART_BACKTRACKING
4352                 backSearchIndex = fp->fInputIdx;
4353                 #endif
4354                 fp->fInputIdx = ix;
4355 
4356                 // Save State to the URX_LOOP_C op that follows this one,
4357                 //   so that match failures in the following code will return to there.
4358                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4359                 fp = StateSave(fp, fp->fPatIdx, status);
4360                 fp->fPatIdx++;
4361             }
4362             break;
4363 
4364 
4365         case URX_LOOP_C:
4366             {
4367                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4368                 backSearchIndex = fp->fExtra[opValue];
4369                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4370                 if (backSearchIndex == fp->fInputIdx) {
4371                     // We've backed up the input idx to the point that the loop started.
4372                     // The loop is done.  Leave here without saving state.
4373                     //  Subsequent failures won't come back here.
4374                     break;
4375                 }
4376                 // Set up for the next iteration of the loop, with input index
4377                 //   backed up by one from the last time through,
4378                 //   and a state save to this instruction in case the following code fails again.
4379                 //   (We're going backwards because this loop emulates stack unwinding, not
4380                 //    the initial scan forward.)
4381                 U_ASSERT(fp->fInputIdx > 0);
4382                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4383                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4385 
4386                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4387                 if (prevC == 0x0a &&
4388                     fp->fInputIdx > backSearchIndex &&
4389                     twoPrevC == 0x0d) {
4390                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4391                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4392                         // .*, stepping back over CRLF pair.
4393                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4394                     }
4395                 }
4396 
4397 
4398                 fp = StateSave(fp, fp->fPatIdx-1, status);
4399             }
4400             break;
4401 
4402 
4403 
4404         default:
4405             // Trouble.  The compiled pattern contains an entry with an
4406             //           unrecognized type tag.
4407             U_ASSERT(FALSE);
4408         }
4409 
4410         if (U_FAILURE(status)) {
4411             isMatch = FALSE;
4412             break;
4413         }
4414     }
4415 
4416 breakFromLoop:
4417     fMatch = isMatch;
4418     if (isMatch) {
4419         fLastMatchEnd = fMatchEnd;
4420         fMatchStart   = startIdx;
4421         fMatchEnd     = fp->fInputIdx;
4422         if (fTraceDebug) {
4423             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4424         }
4425     }
4426     else
4427     {
4428         if (fTraceDebug) {
4429             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4430         }
4431     }
4432 
4433     fFrame = fp;                // The active stack frame when the engine stopped.
4434                                 //   Contains the capture group results that we need to
4435                                 //    access later.
4436     return;
4437 }
4438 
4439 
4440 //--------------------------------------------------------------------------------
4441 //
4442 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4443 //                  assumption that the entire string is available in the UText's
4444 //                  chunk buffer. For now, that means we can use int32_t indexes,
4445 //                  except for anything that needs to be saved (like group starts
4446 //                  and ends).
4447 //
4448 //                  startIdx:    begin matching a this index.
4449 //                  toEnd:       if true, match must extend to end of the input region
4450 //
4451 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4452 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4453     UBool       isMatch  = FALSE;      // True if the we have a match.
4454 
4455     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4456 
4457     int32_t     op;                    // Operation from the compiled pattern, split into
4458     int32_t     opType;                //    the opcode
4459     int32_t     opValue;               //    and the operand value.
4460 
4461 #ifdef REGEX_RUN_DEBUG
4462     if (fTraceDebug)
4463     {
4464         printf("MatchAt(startIdx=%ld)\n", startIdx);
4465         printf("Original Pattern: ");
4466         UChar32 c = utext_next32From(fPattern->fPattern, 0);
4467         while (c != U_SENTINEL) {
4468             if (c<32 || c>256) {
4469                 c = '.';
4470             }
4471             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4472 
4473             c = UTEXT_NEXT32(fPattern->fPattern);
4474         }
4475         printf("\n");
4476         printf("Input String: ");
4477         c = utext_next32From(fInputText, 0);
4478         while (c != U_SENTINEL) {
4479             if (c<32 || c>256) {
4480                 c = '.';
4481             }
4482             printf("%c", c);
4483 
4484             c = UTEXT_NEXT32(fInputText);
4485         }
4486         printf("\n");
4487         printf("\n");
4488     }
4489 #endif
4490 
4491     if (U_FAILURE(status)) {
4492         return;
4493     }
4494 
4495     //  Cache frequently referenced items from the compiled pattern
4496     //
4497     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4498 
4499     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4500     UVector             *sets          = fPattern->fSets;
4501 
4502     const UChar         *inputBuf      = fInputText->chunkContents;
4503 
4504     fFrameSize = fPattern->fFrameSize;
4505     REStackFrame        *fp            = resetStack();
4506 
4507     fp->fPatIdx   = 0;
4508     fp->fInputIdx = startIdx;
4509 
4510     // Zero out the pattern's static data
4511     int32_t i;
4512     for (i = 0; i<fPattern->fDataSize; i++) {
4513         fData[i] = 0;
4514     }
4515 
4516     //
4517     //  Main loop for interpreting the compiled pattern.
4518     //  One iteration of the loop per pattern operation performed.
4519     //
4520     for (;;) {
4521 #if 0
4522         if (_heapchk() != _HEAPOK) {
4523             fprintf(stderr, "Heap Trouble\n");
4524         }
4525 #endif
4526 
4527         op      = (int32_t)pat[fp->fPatIdx];
4528         opType  = URX_TYPE(op);
4529         opValue = URX_VAL(op);
4530 #ifdef REGEX_RUN_DEBUG
4531         if (fTraceDebug) {
4532             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4533             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4534                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4535             fPattern->dumpOp(fp->fPatIdx);
4536         }
4537 #endif
4538         fp->fPatIdx++;
4539 
4540         switch (opType) {
4541 
4542 
4543         case URX_NOP:
4544             break;
4545 
4546 
4547         case URX_BACKTRACK:
4548             // Force a backtrack.  In some circumstances, the pattern compiler
4549             //   will notice that the pattern can't possibly match anything, and will
4550             //   emit one of these at that point.
4551             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552             break;
4553 
4554 
4555         case URX_ONECHAR:
4556             if (fp->fInputIdx < fActiveLimit) {
4557                 UChar32 c;
4558                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4559                 if (c == opValue) {
4560                     break;
4561                 }
4562             } else {
4563                 fHitEnd = TRUE;
4564             }
4565 
4566             #ifdef REGEX_SMART_BACKTRACKING
4567             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4568                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4569                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4570                     int64_t reverseIndex = fp->fInputIdx;
4571                     UChar32 c;
4572                     do {
4573                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4574                         if (c == opValue) {
4575                             break;
4576                         }
4577                     } while (reverseIndex > backSearchIndex);
4578                     if (c == opValue) {
4579                         fHitEnd = FALSE;
4580                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4581                         fp->fInputIdx = reverseIndex;
4582                         if (fp->fInputIdx > backSearchIndex) {
4583                             fp = StateSave(fp, fp->fPatIdx, status);
4584                         }
4585                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
4586                         break;
4587                     }
4588                 }
4589             }
4590             #endif
4591 
4592             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4593             break;
4594 
4595 
4596         case URX_STRING:
4597             {
4598                 // Test input against a literal string.
4599                 // Strings require two slots in the compiled pattern, one for the
4600                 //   offset to the string text, and one for the length.
4601                 int32_t   stringStartIdx = opValue;
4602                 int32_t   stringLen;
4603 
4604                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4605                 fp->fPatIdx++;
4606                 opType    = URX_TYPE(op);
4607                 stringLen = URX_VAL(op);
4608                 U_ASSERT(opType == URX_STRING_LEN);
4609                 U_ASSERT(stringLen >= 2);
4610 
4611                 if (fp->fInputIdx + stringLen > fActiveLimit) {
4612                     // No match.  String is longer than the remaining input text.
4613                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
4614                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615                     break;
4616                 }
4617 
4618                 const UChar * pInp = inputBuf + fp->fInputIdx;
4619                 const UChar * pPat = litText+stringStartIdx;
4620                 const UChar * pEnd = pInp + stringLen;
4621                 UBool success = FALSE;
4622                 for(;;) {
4623                     if (*pInp == *pPat) {
4624                         pInp++;
4625                         pPat++;
4626                         if (pInp == pEnd) {
4627                             // Successful Match.
4628                             success = TRUE;
4629                             break;
4630                         }
4631                     } else {
4632                         // Match failed.
4633                         break;
4634                     }
4635                 }
4636 
4637                 if (success) {
4638                     fp->fInputIdx += stringLen;
4639                 } else {
4640                     #ifdef REGEX_SMART_BACKTRACKING
4641                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4642                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4643                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4644                             // Reset to last start point
4645                             int64_t reverseIndex = fp->fInputIdx;
4646                             UChar32 c;
4647                             pPat = litText+stringStartIdx;
4648 
4649                             // Search backwards for a possible start
4650                             do {
4651                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4652                                 if ((U_IS_BMP(c) && *pPat == c) ||
4653                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4654                                     success = TRUE;
4655                                     break;
4656                                 }
4657                             } while (reverseIndex > backSearchIndex);
4658 
4659                             // And try again
4660                             if (success) {
4661                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4662                                 fp->fInputIdx = reverseIndex;
4663                                 if (fp->fInputIdx > backSearchIndex) {
4664                                     fp = StateSave(fp, fp->fPatIdx, status);
4665                                 }
4666                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4667                                 break;
4668                             }
4669                         }
4670                     }
4671                     #endif
4672                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4673                 }
4674             }
4675             break;
4676 
4677 
4678         case URX_STATE_SAVE:
4679             fp = StateSave(fp, opValue, status);
4680             break;
4681 
4682 
4683         case URX_END:
4684             // The match loop will exit via this path on a successful match,
4685             //   when we reach the end of the pattern.
4686             if (toEnd && fp->fInputIdx != fActiveLimit) {
4687                 // The pattern matched, but not to the end of input.  Try some more.
4688                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4689                 break;
4690             }
4691             isMatch = TRUE;
4692             goto  breakFromLoop;
4693 
4694             // Start and End Capture stack frame variables are laid out out like this:
4695             //  fp->fExtra[opValue]  - The start of a completed capture group
4696             //             opValue+1 - The end   of a completed capture group
4697             //             opValue+2 - the start of a capture group whose end
4698             //                          has not yet been reached (and might not ever be).
4699         case URX_START_CAPTURE:
4700             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4701             fp->fExtra[opValue+2] = fp->fInputIdx;
4702             break;
4703 
4704 
4705         case URX_END_CAPTURE:
4706             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4707             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4708             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4709             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4710             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4711             break;
4712 
4713 
4714         case URX_DOLLAR:                   //  $, test for End of line
4715             //     or for position before new line at end of input
4716             if (fp->fInputIdx < fAnchorLimit-2) {
4717                 // We are no where near the end of input.  Fail.
4718                 //   This is the common case.  Keep it first.
4719                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720                 break;
4721             }
4722             if (fp->fInputIdx >= fAnchorLimit) {
4723                 // We really are at the end of input.  Success.
4724                 fHitEnd = TRUE;
4725                 fRequireEnd = TRUE;
4726                 break;
4727             }
4728 
4729             // If we are positioned just before a new-line that is located at the
4730             //   end of input, succeed.
4731             if (fp->fInputIdx == fAnchorLimit-1) {
4732                 UChar32 c;
4733                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4734 
4735                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4736                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4737                         // At new-line at end of input. Success
4738                         fHitEnd = TRUE;
4739                         fRequireEnd = TRUE;
4740                         break;
4741                     }
4742                 }
4743             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4744                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4745                     fHitEnd = TRUE;
4746                     fRequireEnd = TRUE;
4747                     break;                         // At CR/LF at end of input.  Success
4748             }
4749 
4750             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4751 
4752             break;
4753 
4754 
4755         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4756             if (fp->fInputIdx >= fAnchorLimit-1) {
4757                 // Either at the last character of input, or off the end.
4758                 if (fp->fInputIdx == fAnchorLimit-1) {
4759                     // At last char of input.  Success if it's a new line.
4760                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4761                         fHitEnd = TRUE;
4762                         fRequireEnd = TRUE;
4763                         break;
4764                     }
4765                 } else {
4766                     // Off the end of input.  Success.
4767                     fHitEnd = TRUE;
4768                     fRequireEnd = TRUE;
4769                     break;
4770                 }
4771             }
4772 
4773             // Not at end of input.  Back-track out.
4774             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4775             break;
4776 
4777 
4778         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4779             {
4780                 if (fp->fInputIdx >= fAnchorLimit) {
4781                     // We really are at the end of input.  Success.
4782                     fHitEnd = TRUE;
4783                     fRequireEnd = TRUE;
4784                     break;
4785                 }
4786                 // If we are positioned just before a new-line, succeed.
4787                 // It makes no difference where the new-line is within the input.
4788                 UChar32 c = inputBuf[fp->fInputIdx];
4789                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4790                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4791                     //  In multi-line mode, hitting a new-line just before the end of input does not
4792                     //   set the hitEnd or requireEnd flags
4793                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4794                         break;
4795                     }
4796                 }
4797                 // not at a new line.  Fail.
4798                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4799             }
4800             break;
4801 
4802 
4803         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4804             {
4805                 if (fp->fInputIdx >= fAnchorLimit) {
4806                     // We really are at the end of input.  Success.
4807                     fHitEnd = TRUE;
4808                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4809                     break;               //   adding a new-line would not lose the match.
4810                 }
4811                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4812                 // It makes no difference where the new-line is within the input.
4813                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4814                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4815                 }
4816             }
4817             break;
4818 
4819 
4820         case URX_CARET:                    //  ^, test for start of line
4821             if (fp->fInputIdx != fAnchorStart) {
4822                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4823             }
4824             break;
4825 
4826 
4827         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4828             {
4829                 if (fp->fInputIdx == fAnchorStart) {
4830                     // We are at the start input.  Success.
4831                     break;
4832                 }
4833                 // Check whether character just before the current pos is a new-line
4834                 //   unless we are at the end of input
4835                 UChar  c = inputBuf[fp->fInputIdx - 1];
4836                 if ((fp->fInputIdx < fAnchorLimit) &&
4837                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4838                     //  It's a new-line.  ^ is true.  Success.
4839                     //  TODO:  what should be done with positions between a CR and LF?
4840                     break;
4841                 }
4842                 // Not at the start of a line.  Fail.
4843                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844             }
4845             break;
4846 
4847 
4848         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4849             {
4850                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4851                 if (fp->fInputIdx <= fAnchorStart) {
4852                     // We are at the start input.  Success.
4853                     break;
4854                 }
4855                 // Check whether character just before the current pos is a new-line
4856                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4857                 UChar  c = inputBuf[fp->fInputIdx - 1];
4858                 if (c != 0x0a) {
4859                     // Not at the start of a line.  Back-track out.
4860                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861                 }
4862             }
4863             break;
4864 
4865         case URX_BACKSLASH_B:          // Test for word boundaries
4866             {
4867                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4868                 success ^= (opValue != 0);     // flip sense for \B
4869                 if (!success) {
4870                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4871                 }
4872             }
4873             break;
4874 
4875 
4876         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4877             {
4878                 UBool success = isUWordBoundary(fp->fInputIdx);
4879                 success ^= (opValue != 0);     // flip sense for \B
4880                 if (!success) {
4881                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4882                 }
4883             }
4884             break;
4885 
4886 
4887         case URX_BACKSLASH_D:            // Test for decimal digit
4888             {
4889                 if (fp->fInputIdx >= fActiveLimit) {
4890                     fHitEnd = TRUE;
4891                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4892                     break;
4893                 }
4894 
4895                 UChar32 c;
4896                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4897                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4898                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4899                 success ^= (opValue != 0);        // flip sense for \D
4900                 if (!success) {
4901                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4902                 }
4903             }
4904             break;
4905 
4906 
4907         case URX_BACKSLASH_G:          // Test for position at end of previous match
4908             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4909                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4910             }
4911             break;
4912 
4913 
4914         case URX_BACKSLASH_X:
4915         //  Match a Grapheme, as defined by Unicode TR 29.
4916         //  Differs slightly from Perl, which consumes combining marks independently
4917         //    of context.
4918         {
4919 
4920             // Fail if at end of input
4921             if (fp->fInputIdx >= fActiveLimit) {
4922                 fHitEnd = TRUE;
4923                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4924                 break;
4925             }
4926 
4927             // Examine (and consume) the current char.
4928             //   Dispatch into a little state machine, based on the char.
4929             UChar32  c;
4930             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4931             UnicodeSet **sets = fPattern->fStaticSets;
4932             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4933             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4934             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4935             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4936             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4937             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4938             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4939             goto GC_Extend;
4940 
4941 
4942 
4943 GC_L:
4944             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4945             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4946             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4947             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4948             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4949             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4950             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4951             goto GC_Extend;
4952 
4953 GC_V:
4954             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4955             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4956             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4957             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4958             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4959             goto GC_Extend;
4960 
4961 GC_T:
4962             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4963             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4964             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4965             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4966             goto GC_Extend;
4967 
4968 GC_Extend:
4969             // Combining characters are consumed here
4970             for (;;) {
4971                 if (fp->fInputIdx >= fActiveLimit) {
4972                     break;
4973                 }
4974                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4975                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4976                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4977                     break;
4978                 }
4979             }
4980             goto GC_Done;
4981 
4982 GC_Control:
4983             // Most control chars stand alone (don't combine with combining chars),
4984             //   except for that CR/LF sequence is a single grapheme cluster.
4985             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4986                 fp->fInputIdx++;
4987             }
4988 
4989 GC_Done:
4990             if (fp->fInputIdx >= fActiveLimit) {
4991                 fHitEnd = TRUE;
4992             }
4993             break;
4994         }
4995 
4996 
4997 
4998 
4999         case URX_BACKSLASH_Z:          // Test for end of Input
5000             if (fp->fInputIdx < fAnchorLimit) {
5001                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5002             } else {
5003                 fHitEnd = TRUE;
5004                 fRequireEnd = TRUE;
5005             }
5006             break;
5007 
5008 
5009 
5010         case URX_STATIC_SETREF:
5011             {
5012                 // Test input character against one of the predefined sets
5013                 //    (Word Characters, for example)
5014                 // The high bit of the op value is a flag for the match polarity.
5015                 //    0:   success if input char is in set.
5016                 //    1:   success if input char is not in set.
5017                 if (fp->fInputIdx >= fActiveLimit) {
5018                     fHitEnd = TRUE;
5019                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5020                     break;
5021                 }
5022 
5023                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5024                 opValue &= ~URX_NEG_SET;
5025                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5026 
5027                 UChar32 c;
5028                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5029                 if (c < 256) {
5030                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5031                     if (s8->contains(c)) {
5032                         success = !success;
5033                     }
5034                 } else {
5035                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5036                     if (s->contains(c)) {
5037                         success = !success;
5038                     }
5039                 }
5040                 if (!success) {
5041                     #ifdef REGEX_SMART_BACKTRACKING
5042                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5043                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5044                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5045                             // Try to find it, backwards
5046                             int64_t reverseIndex = fp->fInputIdx;
5047                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5048                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5049                             do {
5050                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5051                                 if (c < 256) {
5052                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5053                                     if (s8->contains(c)) {
5054                                         success = !success;
5055                                     }
5056                                 } else {
5057                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5058                                     if (s->contains(c)) {
5059                                         success = !success;
5060                                     }
5061                                 }
5062                             } while (reverseIndex > backSearchIndex && !success);
5063 
5064                             if (success) {
5065                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5066                                 fp->fInputIdx = reverseIndex;
5067                                 if (fp->fInputIdx > backSearchIndex) {
5068                                     fp = StateSave(fp, fp->fPatIdx, status);
5069                                 }
5070                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5071                                 break;
5072                             }
5073                         }
5074                     }
5075                     #endif
5076                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5077                 }
5078             }
5079             break;
5080 
5081 
5082         case URX_STAT_SETREF_N:
5083             {
5084                 // Test input character for NOT being a member of  one of
5085                 //    the predefined sets (Word Characters, for example)
5086                 if (fp->fInputIdx >= fActiveLimit) {
5087                     fHitEnd = TRUE;
5088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5089                     break;
5090                 }
5091 
5092                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5093 
5094                 UChar32  c;
5095                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5096                 if (c < 256) {
5097                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5098                     if (s8->contains(c) == FALSE) {
5099                         break;
5100                     }
5101                 } else {
5102                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5103                     if (s->contains(c) == FALSE) {
5104                         break;
5105                     }
5106                 }
5107 
5108                 #ifdef REGEX_SMART_BACKTRACKING
5109                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5110                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5111                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5112                         // Try to find it, backwards
5113                         int64_t reverseIndex = fp->fInputIdx;
5114                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5115                         UBool success = FALSE;
5116                         do {
5117                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5118                             if (c < 256) {
5119                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5120                                 if (s8->contains(c) == FALSE) {
5121                                     success = TRUE;
5122                                     break;
5123                                 }
5124                             } else {
5125                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5126                                 if (s->contains(c) == FALSE) {
5127                                     success = TRUE;
5128                                     break;
5129                                 }
5130                             }
5131                         } while (reverseIndex > backSearchIndex);
5132 
5133                         if (success) {
5134                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5135                             fp->fInputIdx = reverseIndex;
5136                             if (fp->fInputIdx > backSearchIndex) {
5137                                 fp = StateSave(fp, fp->fPatIdx, status);
5138                             }
5139                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5140                             break;
5141                         }
5142                     }
5143                 }
5144                 #endif
5145                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5146             }
5147             break;
5148 
5149 
5150         case URX_SETREF:
5151             {
5152                 if (fp->fInputIdx >= fActiveLimit) {
5153                     fHitEnd = TRUE;
5154                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5155                     break;
5156                 }
5157 
5158                 U_ASSERT(opValue > 0 && opValue < sets->size());
5159 
5160                 // There is input left.  Pick up one char and test it for set membership.
5161                 UChar32  c;
5162                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5163                 if (c<256) {
5164                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5165                     if (s8->contains(c)) {
5166                         // The character is in the set.  A Match.
5167                         break;
5168                     }
5169                 } else {
5170                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5171                     if (s->contains(c)) {
5172                         // The character is in the set.  A Match.
5173                         break;
5174                     }
5175                 }
5176 
5177                 // the character wasn't in the set.
5178                 #ifdef REGEX_SMART_BACKTRACKING
5179                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5180                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5181                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5182                         // Try to find it, backwards
5183                         int64_t reverseIndex = fp->fInputIdx;
5184                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5185                         UBool success = FALSE;
5186                         do {
5187                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5188                             if (c < 256) {
5189                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5190                                 if (s8->contains(c)) {
5191                                     success = TRUE;
5192                                     break;
5193                                 }
5194                             } else {
5195                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5196                                 if (s->contains(c)) {
5197                                     success = TRUE;
5198                                     break;
5199                                 }
5200                             }
5201                         } while (reverseIndex > backSearchIndex);
5202 
5203                         if (success) {
5204                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5205                             fp->fInputIdx = reverseIndex;
5206                             if (fp->fInputIdx > reverseIndex) {
5207                                 fp = StateSave(fp, fp->fPatIdx, status);
5208                             }
5209                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5210                             break;
5211                         }
5212                     }
5213                 }
5214                 #endif
5215                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5216             }
5217             break;
5218 
5219 
5220         case URX_DOTANY:
5221             {
5222                 // . matches anything, but stops at end-of-line.
5223                 if (fp->fInputIdx >= fActiveLimit) {
5224                     // At end of input.  Match failed.  Backtrack out.
5225                     fHitEnd = TRUE;
5226                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5227                     break;
5228                 }
5229 
5230                 // There is input left.  Advance over one char, unless we've hit end-of-line
5231                 UChar32  c;
5232                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5233                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5234                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5235                     // End of line in normal mode.   . does not match.
5236                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5237                     break;
5238                 }
5239             }
5240             break;
5241 
5242 
5243         case URX_DOTANY_ALL:
5244             {
5245                 // . in dot-matches-all (including new lines) mode
5246                 if (fp->fInputIdx >= fActiveLimit) {
5247                     // At end of input.  Match failed.  Backtrack out.
5248                     fHitEnd = TRUE;
5249                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5250                     break;
5251                 }
5252 
5253                 // There is input left.  Advance over one char, except if we are
5254                 //   at a cr/lf, advance over both of them.
5255                 UChar32 c;
5256                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5257                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5258                     // In the case of a CR/LF, we need to advance over both.
5259                     if (inputBuf[fp->fInputIdx] == 0x0a) {
5260                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5261                     }
5262                 }
5263             }
5264             break;
5265 
5266 
5267         case URX_DOTANY_UNIX:
5268             {
5269                 // '.' operator, matches all, but stops at end-of-line.
5270                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5271                 if (fp->fInputIdx >= fActiveLimit) {
5272                     // At end of input.  Match failed.  Backtrack out.
5273                     fHitEnd = TRUE;
5274                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5275                     break;
5276                 }
5277 
5278                 // There is input left.  Advance over one char, unless we've hit end-of-line
5279                 UChar32 c;
5280                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5281                 if (c == 0x0a) {
5282                     // End of line in normal mode.   '.' does not match the \n
5283                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5284                 }
5285             }
5286             break;
5287 
5288 
5289         case URX_JMP:
5290             fp->fPatIdx = opValue;
5291             break;
5292 
5293         case URX_FAIL:
5294             isMatch = FALSE;
5295             goto breakFromLoop;
5296 
5297         case URX_JMP_SAV:
5298             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5299             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5300             fp->fPatIdx = opValue;                         // Then JMP.
5301             break;
5302 
5303         case URX_JMP_SAV_X:
5304             // This opcode is used with (x)+, when x can match a zero length string.
5305             // Same as JMP_SAV, except conditional on the match having made forward progress.
5306             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5307             //   data address of the input position at the start of the loop.
5308             {
5309                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5310                 int32_t  stoOp = (int32_t)pat[opValue-1];
5311                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5312                 int32_t  frameLoc = URX_VAL(stoOp);
5313                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5314                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5315                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5316                 if (prevInputIdx < fp->fInputIdx) {
5317                     // The match did make progress.  Repeat the loop.
5318                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5319                     fp->fPatIdx = opValue;
5320                     fp->fExtra[frameLoc] = fp->fInputIdx;
5321                 }
5322                 // If the input position did not advance, we do nothing here,
5323                 //   execution will fall out of the loop.
5324             }
5325             break;
5326 
5327         case URX_CTR_INIT:
5328             {
5329                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5330                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5331 
5332                 // Pick up the three extra operands that CTR_INIT has, and
5333                 //    skip the pattern location counter past
5334                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5335                 fp->fPatIdx += 3;
5336                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5337                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5338                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5339                 U_ASSERT(minCount>=0);
5340                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5341                 U_ASSERT(loopLoc>fp->fPatIdx);
5342 
5343                 if (minCount == 0) {
5344                     fp = StateSave(fp, loopLoc+1, status);
5345                 }
5346                 if (maxCount == 0) {
5347                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5348                 }
5349             }
5350             break;
5351 
5352         case URX_CTR_LOOP:
5353             {
5354                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5355                 int32_t initOp = (int32_t)pat[opValue];
5356                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5357                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5358                 int32_t minCount  = (int32_t)pat[opValue+2];
5359                 int32_t maxCount  = (int32_t)pat[opValue+3];
5360                 // Increment the counter.  Note: we DIDN'T worry about counter
5361                 //   overflow, since the data comes from UnicodeStrings, which
5362                 //   stores its length in an int32_t. Do we have to think about
5363                 //   this now that we're using UText? Probably not, since the length
5364                 //   in UChar32s is still an int32_t.
5365                 (*pCounter)++;
5366                 U_ASSERT(*pCounter > 0);
5367                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5368                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5369                     break;
5370                 }
5371                 if (*pCounter >= minCount) {
5372                     fp = StateSave(fp, fp->fPatIdx, status);
5373                 }
5374                 fp->fPatIdx = opValue + 4;    // Loop back.
5375             }
5376             break;
5377 
5378         case URX_CTR_INIT_NG:
5379             {
5380                 // Initialize a non-greedy loop
5381                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5382                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5383 
5384                 // Pick up the three extra operands that CTR_INIT has, and
5385                 //    skip the pattern location counter past
5386                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5387                 fp->fPatIdx += 3;
5388                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5389                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5390                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5391                 U_ASSERT(minCount>=0);
5392                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5393                 U_ASSERT(loopLoc>fp->fPatIdx);
5394 
5395                 if (minCount == 0) {
5396                     if (maxCount != 0) {
5397                         fp = StateSave(fp, fp->fPatIdx, status);
5398                     }
5399                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5400                 }
5401             }
5402             break;
5403 
5404         case URX_CTR_LOOP_NG:
5405             {
5406                 // Non-greedy {min, max} loops
5407                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5408                 int32_t initOp = (int32_t)pat[opValue];
5409                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5410                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5411                 int32_t minCount  = (int32_t)pat[opValue+2];
5412                 int32_t maxCount  = (int32_t)pat[opValue+3];
5413                 // Increment the counter.  Note: we DIDN'T worry about counter
5414                 //   overflow, since the data comes from UnicodeStrings, which
5415                 //   stores its length in an int32_t. Do we have to think about
5416                 //   this now that we're using UText? Probably not, since the length
5417                 //   in UChar32s is still an int32_t.
5418                 (*pCounter)++;
5419                 U_ASSERT(*pCounter > 0);
5420 
5421                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5422                     // The loop has matched the maximum permitted number of times.
5423                     //   Break out of here with no action.  Matching will
5424                     //   continue with the following pattern.
5425                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5426                     break;
5427                 }
5428 
5429                 if (*pCounter < minCount) {
5430                     // We haven't met the minimum number of matches yet.
5431                     //   Loop back for another one.
5432                     fp->fPatIdx = opValue + 4;    // Loop back.
5433                 } else {
5434                     // We do have the minimum number of matches.
5435                     //   Fall into the following pattern, but first do
5436                     //   a state save to the top of the loop, so that a failure
5437                     //   in the following pattern will try another iteration of the loop.
5438                     fp = StateSave(fp, opValue + 4, status);
5439                 }
5440             }
5441             break;
5442 
5443         case URX_STO_SP:
5444             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5445             fData[opValue] = fStack->size();
5446             break;
5447 
5448         case URX_LD_SP:
5449             {
5450                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5451                 int32_t newStackSize = (int32_t)fData[opValue];
5452                 U_ASSERT(newStackSize <= fStack->size());
5453                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5454                 if (newFP == (int64_t *)fp) {
5455                     break;
5456                 }
5457                 int32_t i;
5458                 for (i=0; i<fFrameSize; i++) {
5459                     newFP[i] = ((int64_t *)fp)[i];
5460                 }
5461                 fp = (REStackFrame *)newFP;
5462                 fStack->setSize(newStackSize);
5463             }
5464             break;
5465 
5466         case URX_BACKREF:
5467         case URX_BACKREF_I:
5468             {
5469                 U_ASSERT(opValue < fFrameSize);
5470                 int64_t groupStartIdx = fp->fExtra[opValue];
5471                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5472                 U_ASSERT(groupStartIdx <= groupEndIdx);
5473                 int64_t len = groupEndIdx-groupStartIdx;
5474                 if (groupStartIdx < 0) {
5475                     // This capture group has not participated in the match thus far,
5476                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5477                 }
5478 
5479                 if (len == 0) {
5480                         //   The capture group match was of an empty string.
5481                         //   Verified by testing:  Perl matches succeed in this case, so
5482                         //   we do too.
5483                         break;
5484                     }
5485 
5486                 UBool  haveMatch = FALSE;
5487                 if (fp->fInputIdx + len <= fActiveLimit) {
5488                     if (opType == URX_BACKREF) {
5489                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5490                             haveMatch = TRUE;
5491                         }
5492                     } else {
5493                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5494                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5495                             haveMatch = TRUE;
5496                         }
5497                     }
5498                 } else {
5499                     // TODO: probably need to do a partial string comparison, and only
5500                     //       set HitEnd if the available input matched.  Ticket #6074
5501                     fHitEnd = TRUE;
5502                 }
5503                 if (haveMatch) {
5504                     fp->fInputIdx += len;     // Match.  Advance current input position.
5505                 } else {
5506                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5507                 }
5508             }
5509             break;
5510 
5511         case URX_STO_INP_LOC:
5512             {
5513                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5514                 fp->fExtra[opValue] = fp->fInputIdx;
5515             }
5516             break;
5517 
5518         case URX_JMPX:
5519             {
5520                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5521                 fp->fPatIdx += 1;
5522                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5523                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5524                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5525                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5526                 if (savedInputIdx < fp->fInputIdx) {
5527                     fp->fPatIdx = opValue;                               // JMP
5528                 } else {
5529                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5530                 }
5531             }
5532             break;
5533 
5534         case URX_LA_START:
5535             {
5536                 // Entering a lookahead block.
5537                 // Save Stack Ptr, Input Pos.
5538                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5539                 fData[opValue]   = fStack->size();
5540                 fData[opValue+1] = fp->fInputIdx;
5541                 fActiveStart     = fLookStart;          // Set the match region change for
5542                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5543             }
5544             break;
5545 
5546         case URX_LA_END:
5547             {
5548                 // Leaving a look-ahead block.
5549                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5550                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5551                 int32_t stackSize = fStack->size();
5552                 int32_t newStackSize = (int32_t)fData[opValue];
5553                 U_ASSERT(stackSize >= newStackSize);
5554                 if (stackSize > newStackSize) {
5555                     // Copy the current top frame back to the new (cut back) top frame.
5556                     //   This makes the capture groups from within the look-ahead
5557                     //   expression available.
5558                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5559                     int32_t i;
5560                     for (i=0; i<fFrameSize; i++) {
5561                         newFP[i] = ((int64_t *)fp)[i];
5562                     }
5563                     fp = (REStackFrame *)newFP;
5564                     fStack->setSize(newStackSize);
5565                 }
5566                 fp->fInputIdx = fData[opValue+1];
5567 
5568                 // Restore the active region bounds in the input string; they may have
5569                 //    been changed because of transparent bounds on a Region.
5570                 fActiveStart = fRegionStart;
5571                 fActiveLimit = fRegionLimit;
5572             }
5573             break;
5574 
5575         case URX_ONECHAR_I:
5576             if (fp->fInputIdx < fActiveLimit) {
5577                 UChar32 c;
5578                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5579                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5580                     break;
5581                 }
5582             } else {
5583                 fHitEnd = TRUE;
5584             }
5585 
5586             #ifdef REGEX_SMART_BACKTRACKING
5587             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5588                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5589                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5590                     UBool success = FALSE;
5591                     int64_t reverseIndex = fp->fInputIdx;
5592                     UChar32 c;
5593                     while (reverseIndex > backSearchIndex) {
5594                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5595                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5596                             success = TRUE;
5597                             break;
5598                         } else if (c == U_SENTINEL) {
5599                             break;
5600                         }
5601                     }
5602                     if (success) {
5603                         fHitEnd = FALSE;
5604                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5605                         fp->fInputIdx = reverseIndex;
5606                         if (fp->fInputIdx > backSearchIndex) {
5607                             fp = StateSave(fp, fp->fPatIdx, status);
5608                         }
5609                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
5610                         break;
5611                     }
5612                 }
5613             }
5614             #endif
5615 
5616             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5617             break;
5618 
5619         case URX_STRING_I:
5620             {
5621                 // Test input against a literal string.
5622                 // Strings require two slots in the compiled pattern, one for the
5623                 //   offset to the string text, and one for the length.
5624                 const UCaseProps *csp = ucase_getSingleton();
5625                 {
5626                     int32_t stringStartIdx, stringLen;
5627                     stringStartIdx = opValue;
5628 
5629                     op      = (int32_t)pat[fp->fPatIdx];
5630                     fp->fPatIdx++;
5631                     opType  = URX_TYPE(op);
5632                     opValue = URX_VAL(op);
5633                     U_ASSERT(opType == URX_STRING_LEN);
5634                     stringLen = opValue;
5635 
5636                     const UChar *patternChars = litText+stringStartIdx;
5637                     const UChar *patternEnd = patternChars+stringLen;
5638 
5639                     const UChar *foldChars = NULL;
5640                     int32_t foldOffset, foldLength;
5641                     UChar32 c;
5642                     UBool c_is_valid = FALSE;
5643 
5644                     #ifdef REGEX_SMART_BACKTRACKING
5645                     int32_t originalInputIdx = fp->fInputIdx;
5646                     #endif
5647                     UBool success = TRUE;
5648 
5649                     foldOffset = foldLength = 0;
5650 
5651                     while (patternChars < patternEnd && success) {
5652                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5653                             if(foldOffset < foldLength) {
5654                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5655                                 c_is_valid = TRUE;
5656                             } else {
5657                                 // test pre-condition of U16_NEXT: i < length
5658                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
5659                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5660                                 c_is_valid = TRUE;
5661                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5662                                 if(foldLength >= 0) {
5663                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5664                                         foldOffset = 0;
5665                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5666                                     } else {
5667                                         c = foldLength;
5668                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
5669                                     }
5670                                 }
5671                             }
5672                         } else {
5673                           c_is_valid = FALSE;
5674                         }
5675 
5676                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5677                             if (U_IS_BMP(c)) {
5678                                 success = (*patternChars == c);
5679                                 patternChars += 1;
5680                             } else if (patternChars+1 < patternEnd) {
5681                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5682                                 patternChars += 2;
5683                             }
5684                         } else {
5685                             success = FALSE;
5686                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
5687                         }
5688                     }
5689 
5690                     if (!success) {
5691                         #ifdef REGEX_SMART_BACKTRACKING
5692                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5693                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5694                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5695                                 // Reset to last start point
5696                                 int64_t reverseIndex = originalInputIdx;
5697                                 patternChars = litText+stringStartIdx;
5698 
5699                                 // Search backwards for a possible start
5700                                 do {
5701                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5702                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5703                                     if(foldLength >= 0) {
5704                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5705                                             foldOffset = 0;
5706                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5707                                         } else {
5708                                             c = foldLength;
5709                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
5710                                         }
5711                                     }
5712 
5713                                     if ((U_IS_BMP(c) && *patternChars == c) ||
5714                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5715                                         success = TRUE;
5716                                         break;
5717                                     }
5718                                 } while (reverseIndex > backSearchIndex);
5719 
5720                                 // And try again
5721                                 if (success) {
5722                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5723                                     fp->fInputIdx = reverseIndex;
5724                                     if (fp->fInputIdx > backSearchIndex) {
5725                                         fp = StateSave(fp, fp->fPatIdx, status);
5726                                     }
5727                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
5728                                     break;
5729                                 }
5730                             }
5731                         }
5732                         #endif
5733                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5734                     }
5735                 }
5736             }
5737             break;
5738 
5739         case URX_LB_START:
5740             {
5741                 // Entering a look-behind block.
5742                 // Save Stack Ptr, Input Pos.
5743                 //   TODO:  implement transparent bounds.  Ticket #6067
5744                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5745                 fData[opValue]   = fStack->size();
5746                 fData[opValue+1] = fp->fInputIdx;
5747                 // Init the variable containing the start index for attempted matches.
5748                 fData[opValue+2] = -1;
5749                 // Save input string length, then reset to pin any matches to end at
5750                 //   the current position.
5751                 fData[opValue+3] = fActiveLimit;
5752                 fActiveLimit     = fp->fInputIdx;
5753             }
5754             break;
5755 
5756 
5757         case URX_LB_CONT:
5758             {
5759                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5760                 //    at all possible input starting positions.
5761 
5762                 // Fetch the min and max possible match lengths.  They are the operands
5763                 //   of this op in the pattern.
5764                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5765                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5766                 U_ASSERT(minML <= maxML);
5767                 U_ASSERT(minML >= 0);
5768 
5769                 // Fetch (from data) the last input index where a match was attempted.
5770                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5771                 int64_t  *lbStartIdx = &fData[opValue+2];
5772                 if (*lbStartIdx < 0) {
5773                     // First time through loop.
5774                     *lbStartIdx = fp->fInputIdx - minML;
5775                 } else {
5776                     // 2nd through nth time through the loop.
5777                     // Back up start position for match by one.
5778                     if (*lbStartIdx == 0) {
5779                         (*lbStartIdx)--;
5780                     } else {
5781                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5782                     }
5783                 }
5784 
5785                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5786                     // We have tried all potential match starting points without
5787                     //  getting a match.  Backtrack out, and out of the
5788                     //   Look Behind altogether.
5789                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5790                     int64_t restoreInputLen = fData[opValue+3];
5791                     U_ASSERT(restoreInputLen >= fActiveLimit);
5792                     U_ASSERT(restoreInputLen <= fInputLength);
5793                     fActiveLimit = restoreInputLen;
5794                     break;
5795                 }
5796 
5797                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5798                 //      (successful match will fall off the end of the loop.)
5799                 fp = StateSave(fp, fp->fPatIdx-3, status);
5800                 fp->fInputIdx =  *lbStartIdx;
5801             }
5802             break;
5803 
5804         case URX_LB_END:
5805             // End of a look-behind block, after a successful match.
5806             {
5807                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5808                 if (fp->fInputIdx != fActiveLimit) {
5809                     //  The look-behind expression matched, but the match did not
5810                     //    extend all the way to the point that we are looking behind from.
5811                     //  FAIL out of here, which will take us back to the LB_CONT, which
5812                     //     will retry the match starting at another position or fail
5813                     //     the look-behind altogether, whichever is appropriate.
5814                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5815                     break;
5816                 }
5817 
5818                 // Look-behind match is good.  Restore the orignal input string length,
5819                 //   which had been truncated to pin the end of the lookbehind match to the
5820                 //   position being looked-behind.
5821                 int64_t originalInputLen = fData[opValue+3];
5822                 U_ASSERT(originalInputLen >= fActiveLimit);
5823                 U_ASSERT(originalInputLen <= fInputLength);
5824                 fActiveLimit = originalInputLen;
5825             }
5826             break;
5827 
5828 
5829         case URX_LBN_CONT:
5830             {
5831                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5832                 //    at all possible input starting positions.
5833 
5834                 // Fetch the extra parameters of this op.
5835                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5836                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5837                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5838                 continueLoc = URX_VAL(continueLoc);
5839                 U_ASSERT(minML <= maxML);
5840                 U_ASSERT(minML >= 0);
5841                 U_ASSERT(continueLoc > fp->fPatIdx);
5842 
5843                 // Fetch (from data) the last input index where a match was attempted.
5844                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5845                 int64_t  *lbStartIdx = &fData[opValue+2];
5846                 if (*lbStartIdx < 0) {
5847                     // First time through loop.
5848                     *lbStartIdx = fp->fInputIdx - minML;
5849                 } else {
5850                     // 2nd through nth time through the loop.
5851                     // Back up start position for match by one.
5852                     if (*lbStartIdx == 0) {
5853                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5854                     } else {
5855                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5856                     }
5857                 }
5858 
5859                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5860                     // We have tried all potential match starting points without
5861                     //  getting a match, which means that the negative lookbehind as
5862                     //  a whole has succeeded.  Jump forward to the continue location
5863                     int64_t restoreInputLen = fData[opValue+3];
5864                     U_ASSERT(restoreInputLen >= fActiveLimit);
5865                     U_ASSERT(restoreInputLen <= fInputLength);
5866                     fActiveLimit = restoreInputLen;
5867                     fp->fPatIdx = continueLoc;
5868                     break;
5869                 }
5870 
5871                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5872                 //      (successful match will cause a FAIL out of the loop altogether.)
5873                 fp = StateSave(fp, fp->fPatIdx-4, status);
5874                 fp->fInputIdx =  *lbStartIdx;
5875             }
5876             break;
5877 
5878         case URX_LBN_END:
5879             // End of a negative look-behind block, after a successful match.
5880             {
5881                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5882                 if (fp->fInputIdx != fActiveLimit) {
5883                     //  The look-behind expression matched, but the match did not
5884                     //    extend all the way to the point that we are looking behind from.
5885                     //  FAIL out of here, which will take us back to the LB_CONT, which
5886                     //     will retry the match starting at another position or succeed
5887                     //     the look-behind altogether, whichever is appropriate.
5888                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5889                     break;
5890                 }
5891 
5892                 // Look-behind expression matched, which means look-behind test as
5893                 //   a whole Fails
5894 
5895                 //   Restore the orignal input string length, which had been truncated
5896                 //   inorder to pin the end of the lookbehind match
5897                 //   to the position being looked-behind.
5898                 int64_t originalInputLen = fData[opValue+3];
5899                 U_ASSERT(originalInputLen >= fActiveLimit);
5900                 U_ASSERT(originalInputLen <= fInputLength);
5901                 fActiveLimit = originalInputLen;
5902 
5903                 // Restore original stack position, discarding any state saved
5904                 //   by the successful pattern match.
5905                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5906                 int32_t newStackSize = (int32_t)fData[opValue];
5907                 U_ASSERT(fStack->size() > newStackSize);
5908                 fStack->setSize(newStackSize);
5909 
5910                 //  FAIL, which will take control back to someplace
5911                 //  prior to entering the look-behind test.
5912                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5913             }
5914             break;
5915 
5916 
5917         case URX_LOOP_SR_I:
5918             // Loop Initialization for the optimized implementation of
5919             //     [some character set]*
5920             //   This op scans through all matching input.
5921             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5922             {
5923                 U_ASSERT(opValue > 0 && opValue < sets->size());
5924                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5925                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5926 
5927                 // Loop through input, until either the input is exhausted or
5928                 //   we reach a character that is not a member of the set.
5929                 int32_t ix = (int32_t)fp->fInputIdx;
5930                 for (;;) {
5931                     if (ix >= fActiveLimit) {
5932                         fHitEnd = TRUE;
5933                         break;
5934                     }
5935                     UChar32   c;
5936                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5937                     if (c<256) {
5938                         if (s8->contains(c) == FALSE) {
5939                             U16_BACK_1(inputBuf, 0, ix);
5940                             break;
5941                         }
5942                     } else {
5943                         if (s->contains(c) == FALSE) {
5944                             U16_BACK_1(inputBuf, 0, ix);
5945                             break;
5946                         }
5947                     }
5948                 }
5949 
5950                 // If there were no matching characters, skip over the loop altogether.
5951                 //   The loop doesn't run at all, a * op always succeeds.
5952                 if (ix == fp->fInputIdx) {
5953                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5954                     break;
5955                 }
5956 
5957                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5958                 //   must follow.  It's operand is the stack location
5959                 //   that holds the starting input index for the match of this [set]*
5960                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5961                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5962                 int32_t stackLoc = URX_VAL(loopcOp);
5963                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5964                 fp->fExtra[stackLoc] = fp->fInputIdx;
5965                 #ifdef REGEX_SMART_BACKTRACKING
5966                 backSearchIndex = fp->fInputIdx;
5967                 #endif
5968                 fp->fInputIdx = ix;
5969 
5970                 // Save State to the URX_LOOP_C op that follows this one,
5971                 //   so that match failures in the following code will return to there.
5972                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5973                 fp = StateSave(fp, fp->fPatIdx, status);
5974                 fp->fPatIdx++;
5975             }
5976             break;
5977 
5978 
5979         case URX_LOOP_DOT_I:
5980             // Loop Initialization for the optimized implementation of .*
5981             //   This op scans through all remaining input.
5982             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5983             {
5984                 // Loop through input until the input is exhausted (we reach an end-of-line)
5985                 // In DOTALL mode, we can just go straight to the end of the input.
5986                 int32_t ix;
5987                 if ((opValue & 1) == 1) {
5988                     // Dot-matches-All mode.  Jump straight to the end of the string.
5989                     ix = (int32_t)fActiveLimit;
5990                     fHitEnd = TRUE;
5991                 } else {
5992                     // NOT DOT ALL mode.  Line endings do not match '.'
5993                     // Scan forward until a line ending or end of input.
5994                     ix = (int32_t)fp->fInputIdx;
5995                     for (;;) {
5996                         if (ix >= fActiveLimit) {
5997                             fHitEnd = TRUE;
5998                             break;
5999                         }
6000                         UChar32   c;
6001                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
6002                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
6003                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
6004                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
6005                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6006                                 //  char is a line ending.  Put the input pos back to the
6007                                 //    line ending char, and exit the scanning loop.
6008                                 U16_BACK_1(inputBuf, 0, ix);
6009                                 break;
6010                             }
6011                         }
6012                     }
6013                 }
6014 
6015                 // If there were no matching characters, skip over the loop altogether.
6016                 //   The loop doesn't run at all, a * op always succeeds.
6017                 if (ix == fp->fInputIdx) {
6018                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
6019                     break;
6020                 }
6021 
6022                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6023                 //   must follow.  It's operand is the stack location
6024                 //   that holds the starting input index for the match of this .*
6025                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6026                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6027                 int32_t stackLoc = URX_VAL(loopcOp);
6028                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6029                 fp->fExtra[stackLoc] = fp->fInputIdx;
6030                 #ifdef REGEX_SMART_BACKTRACKING
6031                 backSearchIndex = fp->fInputIdx;
6032                 #endif
6033                 fp->fInputIdx = ix;
6034 
6035                 // Save State to the URX_LOOP_C op that follows this one,
6036                 //   so that match failures in the following code will return to there.
6037                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6038                 fp = StateSave(fp, fp->fPatIdx, status);
6039                 fp->fPatIdx++;
6040             }
6041             break;
6042 
6043 
6044         case URX_LOOP_C:
6045             {
6046                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
6047                 backSearchIndex = (int32_t)fp->fExtra[opValue];
6048                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
6049                 if (backSearchIndex == fp->fInputIdx) {
6050                     // We've backed up the input idx to the point that the loop started.
6051                     // The loop is done.  Leave here without saving state.
6052                     //  Subsequent failures won't come back here.
6053                     break;
6054                 }
6055                 // Set up for the next iteration of the loop, with input index
6056                 //   backed up by one from the last time through,
6057                 //   and a state save to this instruction in case the following code fails again.
6058                 //   (We're going backwards because this loop emulates stack unwinding, not
6059                 //    the initial scan forward.)
6060                 U_ASSERT(fp->fInputIdx > 0);
6061                 UChar32 prevC;
6062                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6063 
6064                 if (prevC == 0x0a &&
6065                     fp->fInputIdx > backSearchIndex &&
6066                     inputBuf[fp->fInputIdx-1] == 0x0d) {
6067                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6068                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6069                         // .*, stepping back over CRLF pair.
6070                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6071                     }
6072                 }
6073 
6074 
6075                 fp = StateSave(fp, fp->fPatIdx-1, status);
6076             }
6077             break;
6078 
6079 
6080 
6081         default:
6082             // Trouble.  The compiled pattern contains an entry with an
6083             //           unrecognized type tag.
6084             U_ASSERT(FALSE);
6085         }
6086 
6087         if (U_FAILURE(status)) {
6088             isMatch = FALSE;
6089             break;
6090         }
6091     }
6092 
6093 breakFromLoop:
6094     fMatch = isMatch;
6095     if (isMatch) {
6096         fLastMatchEnd = fMatchEnd;
6097         fMatchStart   = startIdx;
6098         fMatchEnd     = fp->fInputIdx;
6099         if (fTraceDebug) {
6100             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6101         }
6102     }
6103     else
6104     {
6105         if (fTraceDebug) {
6106             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6107         }
6108     }
6109 
6110     fFrame = fp;                // The active stack frame when the engine stopped.
6111     //   Contains the capture group results that we need to
6112     //    access later.
6113 
6114     return;
6115 }
6116 
6117 
6118 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6119 
6120 U_NAMESPACE_END
6121 
6122 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6123