• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2010 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31 
32 // #include <malloc.h>        // Needed for heapcheck testing
33 
34 
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41 
42 
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53 
54 //#define REGEX_SMART_BACKTRACKING 1
55 
56 U_NAMESPACE_BEGIN
57 
58 // Default limit for the size of the back track stack, to avoid system
59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 //    which use recursion rather than the heap, and take more storage per
62 //    backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65 
66 // Time limit counter constant.
67 //   Time limits for expression evaluation are in terms of quanta of work by
68 //   the engine, each of which is 10,000 state saves.
69 //   This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71 
72 //-----------------------------------------------------------------------------
73 //
74 //   Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78     fDeferredStatus = U_ZERO_ERROR;
79     init(fDeferredStatus);
80     if (U_FAILURE(fDeferredStatus)) {
81         return;
82     }
83     if (pat==NULL) {
84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85         return;
86     }
87     fPattern = pat;
88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90 
91 
92 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                            uint32_t flags, UErrorCode &status) {
95     init(status);
96     if (U_FAILURE(status)) {
97         return;
98     }
99     UParseError    pe;
100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101     fPattern           = fPatternOwned;
102 
103     UText inputText = UTEXT_INITIALIZER;
104     utext_openConstUnicodeString(&inputText, &input, &status);
105     init2(&inputText, status);
106     utext_close(&inputText);
107 
108     fInputUniStrMaybeMutable = TRUE;
109 }
110 
111 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                            uint32_t flags, UErrorCode &status) {
114     init(status);
115     if (U_FAILURE(status)) {
116         return;
117     }
118     UParseError    pe;
119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120     if (U_FAILURE(status)) {
121         return;
122     }
123 
124     fPattern           = fPatternOwned;
125     init2(input, status);
126 }
127 
128 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                            uint32_t flags, UErrorCode &status) {
131     init(status);
132     if (U_FAILURE(status)) {
133         return;
134     }
135     UParseError    pe;
136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137     if (U_FAILURE(status)) {
138         return;
139     }
140     fPattern           = fPatternOwned;
141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145                            uint32_t flags, UErrorCode &status) {
146     init(status);
147     if (U_FAILURE(status)) {
148         return;
149     }
150     UParseError    pe;
151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152         if (U_FAILURE(status)) {
153         return;
154     }
155 
156     fPattern           = fPatternOwned;
157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159 
160 
161 
162 
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164     delete fStack;
165     if (fData != fSmallData) {
166         uprv_free(fData);
167         fData = NULL;
168     }
169     if (fPatternOwned) {
170         delete fPatternOwned;
171         fPatternOwned = NULL;
172         fPattern = NULL;
173     }
174 
175     if (fInput) {
176         delete fInput;
177     }
178     if (fInputText) {
179         utext_close(fInputText);
180     }
181     if (fAltInputText) {
182         utext_close(fAltInputText);
183     }
184 
185     #if UCONFIG_NO_BREAK_ITERATION==0
186     delete fWordBreakItr;
187     #endif
188 }
189 
190 //
191 //   init()   common initialization for use by all constructors.
192 //            Initialize all fields, get the object into a consistent state.
193 //            This must be done even when the initial status shows an error,
194 //            so that the object is initialized sufficiently well for the destructor
195 //            to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198     fPattern           = NULL;
199     fPatternOwned      = NULL;
200     fFrameSize         = 0;
201     fRegionStart       = 0;
202     fRegionLimit       = 0;
203     fAnchorStart       = 0;
204     fAnchorLimit       = 0;
205     fLookStart         = 0;
206     fLookLimit         = 0;
207     fActiveStart       = 0;
208     fActiveLimit       = 0;
209     fTransparentBounds = FALSE;
210     fAnchoringBounds   = TRUE;
211     fMatch             = FALSE;
212     fMatchStart        = 0;
213     fMatchEnd          = 0;
214     fLastMatchEnd      = -1;
215     fAppendPosition    = 0;
216     fHitEnd            = FALSE;
217     fRequireEnd        = FALSE;
218     fStack             = NULL;
219     fFrame             = NULL;
220     fTimeLimit         = 0;
221     fTime              = 0;
222     fTickCounter       = 0;
223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224     fCallbackFn        = NULL;
225     fCallbackContext   = NULL;
226     fFindProgressCallbackFn      = NULL;
227     fFindProgressCallbackContext = NULL;
228     fTraceDebug        = FALSE;
229     fDeferredStatus    = status;
230     fData              = fSmallData;
231     fWordBreakItr      = NULL;
232 
233     fStack             = new UVector64(status);
234     fInputText         = NULL;
235     fAltInputText      = NULL;
236     fInput             = NULL;
237     fInputLength       = 0;
238     fInputUniStrMaybeMutable = FALSE;
239 
240     if (U_FAILURE(status)) {
241         fDeferredStatus = status;
242     }
243 }
244 
245 //
246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247 //            This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250     if (U_FAILURE(status)) {
251         fDeferredStatus = status;
252         return;
253     }
254 
255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257         if (fData == NULL) {
258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259             return;
260         }
261     }
262 
263     reset(input);
264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265     if (U_FAILURE(status)) {
266         fDeferredStatus = status;
267         return;
268     }
269 }
270 
271 
272 static const UChar BACKSLASH  = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 //    appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                               const UnicodeString &replacement,
281                                               UErrorCode &status) {
282     UText replacementText = UTEXT_INITIALIZER;
283 
284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
285     if (U_SUCCESS(status)) {
286         UText resultText = UTEXT_INITIALIZER;
287         utext_openUnicodeString(&resultText, &dest, &status);
288 
289         if (U_SUCCESS(status)) {
290             appendReplacement(&resultText, &replacementText, status);
291             utext_close(&resultText);
292         }
293         utext_close(&replacementText);
294     }
295 
296     return *this;
297 }
298 
299 //
300 //    appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                               UText *replacement,
304                                               UErrorCode &status) {
305     if (U_FAILURE(status)) {
306         return *this;
307     }
308     if (U_FAILURE(fDeferredStatus)) {
309         status = fDeferredStatus;
310         return *this;
311     }
312     if (fMatch == FALSE) {
313         status = U_REGEX_INVALID_STATE;
314         return *this;
315     }
316 
317     // Copy input string from the end of previous match to start of current match
318     int64_t  destLen = utext_nativeLength(dest);
319     if (fMatchStart > fAppendPosition) {
320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
323         } else {
324             int32_t len16;
325             if (UTEXT_USES_U16(fInputText)) {
326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
327             } else {
328                 UErrorCode lengthStatus = U_ZERO_ERROR;
329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330             }
331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332             if (inputChars == NULL) {
333                 status = U_MEMORY_ALLOCATION_ERROR;
334                 return *this;
335             }
336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338             uprv_free(inputChars);
339         }
340     }
341     fAppendPosition = fMatchEnd;
342 
343 
344     // scan the replacement text, looking for substitutions ($n) and \escapes.
345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346     //         move entire ranges not containing substitutions.
347     UTEXT_SETNATIVEINDEX(replacement, 0);
348     UChar32 c = UTEXT_NEXT32(replacement);
349     while (c != U_SENTINEL) {
350         if (c == BACKSLASH) {
351             // Backslash Escape.  Copy the following char out without further checks.
352             //                    Note:  Surrogate pairs don't need any special handling
353             //                           The second half wont be a '$' or a '\', and
354             //                           will move to the dest normally on the next
355             //                           loop iteration.
356             c = UTEXT_CURRENT32(replacement);
357             if (c == U_SENTINEL) {
358                 break;
359             }
360 
361             if (c==0x55/*U*/ || c==0x75/*u*/) {
362                 // We have a \udddd or \Udddddddd escape sequence.
363                 int32_t offset = 0;
364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
367                     if (U_IS_BMP(escapedChar)) {
368                         UChar c16 = (UChar)escapedChar;
369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                     } else {
371                         UChar surrogate[2];
372                         surrogate[0] = U16_LEAD(escapedChar);
373                         surrogate[1] = U16_TRAIL(escapedChar);
374                         if (U_SUCCESS(status)) {
375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                         }
377                     }
378                     // TODO:  Report errors for mal-formed \u escapes?
379                     //        As this is, the original sequence is output, which may be OK.
380                     if (context.lastOffset == offset) {
381                         UTEXT_PREVIOUS32(replacement);
382                     } else if (context.lastOffset != offset-1) {
383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                     }
385                 }
386             } else {
387                 UTEXT_NEXT32(replacement);
388                 // Plain backslash escape.  Just put out the escaped character.
389                 if (U_IS_BMP(c)) {
390                     UChar c16 = (UChar)c;
391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                 } else {
393                     UChar surrogate[2];
394                     surrogate[0] = U16_LEAD(c);
395                     surrogate[1] = U16_TRAIL(c);
396                     if (U_SUCCESS(status)) {
397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                     }
399                 }
400             }
401         } else if (c != DOLLARSIGN) {
402             // Normal char, not a $.  Copy it out without further checks.
403             if (U_IS_BMP(c)) {
404                 UChar c16 = (UChar)c;
405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406             } else {
407                 UChar surrogate[2];
408                 surrogate[0] = U16_LEAD(c);
409                 surrogate[1] = U16_TRAIL(c);
410                 if (U_SUCCESS(status)) {
411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                 }
413             }
414         } else {
415             // We've got a $.  Pick up a capture group number if one follows.
416             // Consume at most the number of digits necessary for the largest capture
417             // number that is valid for this pattern.
418 
419             int32_t numDigits = 0;
420             int32_t groupNum  = 0;
421             UChar32 digitC;
422             for (;;) {
423                 digitC = UTEXT_CURRENT32(replacement);
424                 if (digitC == U_SENTINEL) {
425                     break;
426                 }
427                 if (u_isdigit(digitC) == FALSE) {
428                     break;
429                 }
430                 UTEXT_NEXT32(replacement);
431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
432                 numDigits++;
433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
434                     break;
435                 }
436             }
437 
438 
439             if (numDigits == 0) {
440                 // The $ didn't introduce a group number at all.
441                 // Treat it as just part of the substitution text.
442                 UChar c16 = DOLLARSIGN;
443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444             } else {
445                 // Finally, append the capture group data to the destination.
446                 destLen += appendGroup(groupNum, dest, status);
447                 if (U_FAILURE(status)) {
448                     // Can fail if group number is out of range.
449                     break;
450                 }
451             }
452         }
453 
454         if (U_FAILURE(status)) {
455             break;
456         } else {
457             c = UTEXT_NEXT32(replacement);
458         }
459     }
460 
461     return *this;
462 }
463 
464 
465 
466 //--------------------------------------------------------------------------------
467 //
468 //    appendTail     Intended to be used in conjunction with appendReplacement()
469 //                   To the destination string, append everything following
470 //                   the last match position from the input string.
471 //
472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476     UErrorCode status = U_ZERO_ERROR;
477     UText resultText = UTEXT_INITIALIZER;
478     utext_openUnicodeString(&resultText, &dest, &status);
479 
480     if (U_SUCCESS(status)) {
481         appendTail(&resultText, status);
482         utext_close(&resultText);
483     }
484 
485     return dest;
486 }
487 
488 //
489 //   appendTail, UText mode
490 //
appendTail(UText * dest,UErrorCode & status)491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492     UBool bailOut = FALSE;
493     if (U_FAILURE(status)) {
494         bailOut = TRUE;
495     }
496     if (U_FAILURE(fDeferredStatus)) {
497         status = fDeferredStatus;
498         bailOut = TRUE;
499     }
500 
501     if (bailOut) {
502         //  dest must not be NULL
503         if (dest) {
504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505             return dest;
506         }
507     }
508 
509     if (fInputLength > fAppendPosition) {
510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511             int64_t destLen = utext_nativeLength(dest);
512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                           (int32_t)(fInputLength-fAppendPosition), &status);
514         } else {
515             int32_t len16;
516             if (UTEXT_USES_U16(fInputText)) {
517                 len16 = (int32_t)(fInputLength-fAppendPosition);
518             } else {
519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                 status = U_ZERO_ERROR; // buffer overflow
521             }
522 
523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524             if (inputChars == NULL) {
525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526             } else {
527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                 int64_t destLen = utext_nativeLength(dest);
529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                 uprv_free(inputChars);
531             }
532         }
533     }
534     return dest;
535 }
536 
537 
538 
539 //--------------------------------------------------------------------------------
540 //
541 //   end
542 //
543 //--------------------------------------------------------------------------------
end(UErrorCode & err) const544 int32_t RegexMatcher::end(UErrorCode &err) const {
545     return end(0, err);
546 }
547 
end64(UErrorCode & err) const548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549     return end64(0, err);
550 }
551 
end64(int32_t group,UErrorCode & err) const552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553     if (U_FAILURE(err)) {
554         return -1;
555     }
556     if (fMatch == FALSE) {
557         err = U_REGEX_INVALID_STATE;
558         return -1;
559     }
560     if (group < 0 || group > fPattern->fGroupMap->size()) {
561         err = U_INDEX_OUTOFBOUNDS_ERROR;
562         return -1;
563     }
564     int64_t e = -1;
565     if (group == 0) {
566         e = fMatchEnd;
567     } else {
568         // Get the position within the stack frame of the variables for
569         //    this capture group.
570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571         U_ASSERT(groupOffset < fPattern->fFrameSize);
572         U_ASSERT(groupOffset >= 0);
573         e = fFrame->fExtra[groupOffset + 1];
574     }
575 
576         return e;
577 }
578 
end(int32_t group,UErrorCode & err) const579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580     return (int32_t)end64(group, err);
581 }
582 
583 
584 //--------------------------------------------------------------------------------
585 //
586 //   find()
587 //
588 //--------------------------------------------------------------------------------
find()589 UBool RegexMatcher::find() {
590     // Start at the position of the last match end.  (Will be zero if the
591     //   matcher has been reset.)
592     //
593     if (U_FAILURE(fDeferredStatus)) {
594         return FALSE;
595     }
596 
597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598         return findUsingChunk();
599     }
600 
601     int64_t startPos = fMatchEnd;
602     if (startPos==0) {
603         startPos = fActiveStart;
604     }
605 
606     if (fMatch) {
607         // Save the position of any previous successful match.
608         fLastMatchEnd = fMatchEnd;
609 
610         if (fMatchStart == fMatchEnd) {
611             // Previous match had zero length.  Move start position up one position
612             //  to avoid sending find() into a loop on zero-length matches.
613             if (startPos >= fActiveLimit) {
614                 fMatch = FALSE;
615                 fHitEnd = TRUE;
616                 return FALSE;
617             }
618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
619             UTEXT_NEXT32(fInputText);
620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
621         }
622     } else {
623         if (fLastMatchEnd >= 0) {
624             // A previous find() failed to match.  Don't try again.
625             //   (without this test, a pattern with a zero-length match
626             //    could match again at the end of an input string.)
627             fHitEnd = TRUE;
628             return FALSE;
629         }
630     }
631 
632 
633     // Compute the position in the input string beyond which a match can not begin, because
634     //   the minimum length match would extend past the end of the input.
635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636     //          Be aware of possible overflows if making changes here.
637     int64_t testStartLimit;
638     if (UTEXT_USES_U16(fInputText)) {
639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640         if (startPos > testStartLimit) {
641             fMatch = FALSE;
642             fHitEnd = TRUE;
643             return FALSE;
644         }
645     } else {
646         // For now, let the matcher discover that it can't match on its own
647         // We don't know how long the match len is in native characters
648         testStartLimit = fActiveLimit;
649     }
650 
651     UChar32  c;
652     U_ASSERT(startPos >= 0);
653 
654     switch (fPattern->fStartType) {
655     case START_NO_INFO:
656         // No optimization was found.
657         //  Try a match at each input position.
658         for (;;) {
659             MatchAt(startPos, FALSE, fDeferredStatus);
660             if (U_FAILURE(fDeferredStatus)) {
661                 return FALSE;
662             }
663             if (fMatch) {
664                 return TRUE;
665             }
666             if (startPos >= testStartLimit) {
667                 fHitEnd = TRUE;
668                 return FALSE;
669             }
670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
671             UTEXT_NEXT32(fInputText);
672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
673             // Note that it's perfectly OK for a pattern to have a zero-length
674             //   match at the end of a string, so we must make sure that the loop
675             //   runs with startPos == testStartLimit the last time through.
676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                 return FALSE;
678         }
679         U_ASSERT(FALSE);
680 
681     case START_START:
682         // Matches are only possible at the start of the input string
683         //   (pattern begins with ^ or \A)
684         if (startPos > fActiveStart) {
685             fMatch = FALSE;
686             return FALSE;
687         }
688         MatchAt(startPos, FALSE, fDeferredStatus);
689         if (U_FAILURE(fDeferredStatus)) {
690             return FALSE;
691         }
692         return fMatch;
693 
694 
695     case START_SET:
696         {
697             // Match may start on any char from a pre-computed set.
698             U_ASSERT(fPattern->fMinMatchLen > 0);
699             int64_t pos;
700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
701             for (;;) {
702                 c = UTEXT_NEXT32(fInputText);
703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
705                 // skip this next block (so we don't have a negative array index)
706                 // and handle end of text in the following block.
707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                     MatchAt(startPos, FALSE, fDeferredStatus);
710                     if (U_FAILURE(fDeferredStatus)) {
711                         return FALSE;
712                     }
713                     if (fMatch) {
714                         return TRUE;
715                     }
716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
717                 }
718                 if (startPos >= testStartLimit) {
719                     fMatch = FALSE;
720                     fHitEnd = TRUE;
721                     return FALSE;
722                 }
723                 startPos = pos;
724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                     return FALSE;
726             }
727         }
728         U_ASSERT(FALSE);
729 
730     case START_STRING:
731     case START_CHAR:
732         {
733             // Match starts on exactly one char.
734             U_ASSERT(fPattern->fMinMatchLen > 0);
735             UChar32 theChar = fPattern->fInitialChar;
736             int64_t pos;
737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
738             for (;;) {
739                 c = UTEXT_NEXT32(fInputText);
740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
741                 if (c == theChar) {
742                     MatchAt(startPos, FALSE, fDeferredStatus);
743                     if (U_FAILURE(fDeferredStatus)) {
744                         return FALSE;
745                     }
746                     if (fMatch) {
747                         return TRUE;
748                     }
749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
750                 }
751                 if (startPos >= testStartLimit) {
752                     fMatch = FALSE;
753                     fHitEnd = TRUE;
754                     return FALSE;
755                 }
756                 startPos = pos;
757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                     return FALSE;
759            }
760         }
761         U_ASSERT(FALSE);
762 
763     case START_LINE:
764         {
765             UChar32  c;
766             if (startPos == fAnchorStart) {
767                 MatchAt(startPos, FALSE, fDeferredStatus);
768                 if (U_FAILURE(fDeferredStatus)) {
769                     return FALSE;
770                 }
771                 if (fMatch) {
772                     return TRUE;
773                 }
774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                 c = UTEXT_NEXT32(fInputText);
776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777             } else {
778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                 c = UTEXT_PREVIOUS32(fInputText);
780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781             }
782 
783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                 for (;;) {
785                     if (c == 0x0a) {
786                             MatchAt(startPos, FALSE, fDeferredStatus);
787                             if (U_FAILURE(fDeferredStatus)) {
788                                 return FALSE;
789                             }
790                             if (fMatch) {
791                                 return TRUE;
792                             }
793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                     }
795                     if (startPos >= testStartLimit) {
796                         fMatch = FALSE;
797                         fHitEnd = TRUE;
798                         return FALSE;
799                     }
800                     c = UTEXT_NEXT32(fInputText);
801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                     // Note that it's perfectly OK for a pattern to have a zero-length
803                     //   match at the end of a string, so we must make sure that the loop
804                     //   runs with startPos == testStartLimit the last time through.
805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                         return FALSE;
807                 }
808             } else {
809                 for (;;) {
810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                 UTEXT_NEXT32(fInputText);
814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                             }
816                             MatchAt(startPos, FALSE, fDeferredStatus);
817                             if (U_FAILURE(fDeferredStatus)) {
818                                 return FALSE;
819                             }
820                             if (fMatch) {
821                                 return TRUE;
822                             }
823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                     }
825                     if (startPos >= testStartLimit) {
826                         fMatch = FALSE;
827                         fHitEnd = TRUE;
828                         return FALSE;
829                     }
830                     c = UTEXT_NEXT32(fInputText);
831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                     // Note that it's perfectly OK for a pattern to have a zero-length
833                     //   match at the end of a string, so we must make sure that the loop
834                     //   runs with startPos == testStartLimit the last time through.
835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                         return FALSE;
837                 }
838             }
839         }
840 
841     default:
842         U_ASSERT(FALSE);
843     }
844 
845     U_ASSERT(FALSE);
846     return FALSE;
847 }
848 
849 
850 
find(int64_t start,UErrorCode & status)851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852     if (U_FAILURE(status)) {
853         return FALSE;
854     }
855     if (U_FAILURE(fDeferredStatus)) {
856         status = fDeferredStatus;
857         return FALSE;
858     }
859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                           //        This will reset the region to be the full input length.
861     if (start < 0) {
862         status = U_INDEX_OUTOFBOUNDS_ERROR;
863         return FALSE;
864     }
865 
866     int64_t nativeStart = start;
867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868         status = U_INDEX_OUTOFBOUNDS_ERROR;
869         return FALSE;
870     }
871     fMatchEnd = nativeStart;
872     return find();
873 }
874 
875 
876 //--------------------------------------------------------------------------------
877 //
878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
879 //                       entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
findUsingChunk()882 UBool RegexMatcher::findUsingChunk() {
883     // Start at the position of the last match end.  (Will be zero if the
884     //   matcher has been reset.
885     //
886 
887     int32_t startPos = (int32_t)fMatchEnd;
888     if (startPos==0) {
889         startPos = (int32_t)fActiveStart;
890     }
891 
892     const UChar *inputBuf = fInputText->chunkContents;
893 
894     if (fMatch) {
895         // Save the position of any previous successful match.
896         fLastMatchEnd = fMatchEnd;
897 
898         if (fMatchStart == fMatchEnd) {
899             // Previous match had zero length.  Move start position up one position
900             //  to avoid sending find() into a loop on zero-length matches.
901             if (startPos >= fActiveLimit) {
902                 fMatch = FALSE;
903                 fHitEnd = TRUE;
904                 return FALSE;
905             }
906             U16_FWD_1(inputBuf, startPos, fInputLength);
907         }
908     } else {
909         if (fLastMatchEnd >= 0) {
910             // A previous find() failed to match.  Don't try again.
911             //   (without this test, a pattern with a zero-length match
912             //    could match again at the end of an input string.)
913             fHitEnd = TRUE;
914             return FALSE;
915         }
916     }
917 
918 
919     // Compute the position in the input string beyond which a match can not begin, because
920     //   the minimum length match would extend past the end of the input.
921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922     //          Be aware of possible overflows if making changes here.
923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924     if (startPos > testLen) {
925         fMatch = FALSE;
926         fHitEnd = TRUE;
927         return FALSE;
928     }
929 
930     UChar32  c;
931     U_ASSERT(startPos >= 0);
932 
933     switch (fPattern->fStartType) {
934     case START_NO_INFO:
935         // No optimization was found.
936         //  Try a match at each input position.
937         for (;;) {
938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
939             if (U_FAILURE(fDeferredStatus)) {
940                 return FALSE;
941             }
942             if (fMatch) {
943                 return TRUE;
944             }
945             if (startPos >= testLen) {
946                 fHitEnd = TRUE;
947                 return FALSE;
948             }
949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
950             // Note that it's perfectly OK for a pattern to have a zero-length
951             //   match at the end of a string, so we must make sure that the loop
952             //   runs with startPos == testLen the last time through.
953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                 return FALSE;
955         }
956         U_ASSERT(FALSE);
957 
958     case START_START:
959         // Matches are only possible at the start of the input string
960         //   (pattern begins with ^ or \A)
961         if (startPos > fActiveStart) {
962             fMatch = FALSE;
963             return FALSE;
964         }
965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
966         if (U_FAILURE(fDeferredStatus)) {
967             return FALSE;
968         }
969         return fMatch;
970 
971 
972     case START_SET:
973     {
974         // Match may start on any char from a pre-computed set.
975         U_ASSERT(fPattern->fMinMatchLen > 0);
976         for (;;) {
977             int32_t pos = startPos;
978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
982                 if (U_FAILURE(fDeferredStatus)) {
983                     return FALSE;
984                 }
985                 if (fMatch) {
986                     return TRUE;
987                 }
988             }
989             if (pos >= testLen) {
990                 fMatch = FALSE;
991                 fHitEnd = TRUE;
992                 return FALSE;
993             }
994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                 return FALSE;
996         }
997     }
998         U_ASSERT(FALSE);
999 
1000     case START_STRING:
1001     case START_CHAR:
1002     {
1003         // Match starts on exactly one char.
1004         U_ASSERT(fPattern->fMinMatchLen > 0);
1005         UChar32 theChar = fPattern->fInitialChar;
1006         for (;;) {
1007             int32_t pos = startPos;
1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009             if (c == theChar) {
1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                 if (U_FAILURE(fDeferredStatus)) {
1012                     return FALSE;
1013                 }
1014                 if (fMatch) {
1015                     return TRUE;
1016                 }
1017             }
1018             if (pos >= testLen) {
1019                 fMatch = FALSE;
1020                 fHitEnd = TRUE;
1021                 return FALSE;
1022             }
1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                 return FALSE;
1025         }
1026     }
1027         U_ASSERT(FALSE);
1028 
1029     case START_LINE:
1030     {
1031         UChar32  c;
1032         if (startPos == fAnchorStart) {
1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034             if (U_FAILURE(fDeferredStatus)) {
1035                 return FALSE;
1036             }
1037             if (fMatch) {
1038                 return TRUE;
1039             }
1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041         }
1042 
1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044             for (;;) {
1045                 c = inputBuf[startPos-1];
1046                 if (c == 0x0a) {
1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                     if (U_FAILURE(fDeferredStatus)) {
1049                         return FALSE;
1050                     }
1051                     if (fMatch) {
1052                         return TRUE;
1053                     }
1054                 }
1055                 if (startPos >= testLen) {
1056                     fMatch = FALSE;
1057                     fHitEnd = TRUE;
1058                     return FALSE;
1059                 }
1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                 // Note that it's perfectly OK for a pattern to have a zero-length
1062                 //   match at the end of a string, so we must make sure that the loop
1063                 //   runs with startPos == testLen the last time through.
1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                     return FALSE;
1066             }
1067         } else {
1068             for (;;) {
1069                 c = inputBuf[startPos-1];
1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                         startPos++;
1074                     }
1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                     if (U_FAILURE(fDeferredStatus)) {
1077                         return FALSE;
1078                     }
1079                     if (fMatch) {
1080                         return TRUE;
1081                     }
1082                 }
1083                 if (startPos >= testLen) {
1084                     fMatch = FALSE;
1085                     fHitEnd = TRUE;
1086                     return FALSE;
1087                 }
1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                 // Note that it's perfectly OK for a pattern to have a zero-length
1090                 //   match at the end of a string, so we must make sure that the loop
1091                 //   runs with startPos == testLen the last time through.
1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                     return FALSE;
1094             }
1095         }
1096     }
1097 
1098     default:
1099         U_ASSERT(FALSE);
1100     }
1101 
1102     U_ASSERT(FALSE);
1103     return FALSE;
1104 }
1105 
1106 
1107 
1108 //--------------------------------------------------------------------------------
1109 //
1110 //  group()
1111 //
1112 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114     return group(0, status);
1115 }
1116 
1117 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119     return group(0, dest, group_len, status);
1120 }
1121 
1122 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124     group_len = 0;
1125     UBool bailOut = FALSE;
1126     if (U_FAILURE(status)) {
1127         return dest;
1128     }
1129     if (U_FAILURE(fDeferredStatus)) {
1130         status = fDeferredStatus;
1131         bailOut = TRUE;
1132     }
1133     if (fMatch == FALSE) {
1134         status = U_REGEX_INVALID_STATE;
1135         bailOut = TRUE;
1136     }
1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
1139         bailOut = TRUE;
1140     }
1141 
1142     if (bailOut) {
1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144     }
1145 
1146     int64_t s, e;
1147     if (groupNum == 0) {
1148         s = fMatchStart;
1149         e = fMatchEnd;
1150     } else {
1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
1153         U_ASSERT(groupOffset >= 0);
1154         s = fFrame->fExtra[groupOffset];
1155         e = fFrame->fExtra[groupOffset+1];
1156     }
1157 
1158     if (s < 0) {
1159         // A capture group wasn't part of the match
1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161     }
1162     U_ASSERT(s <= e);
1163     group_len = e - s;
1164 
1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166     if (dest)
1167         UTEXT_SETNATIVEINDEX(dest, s);
1168     return dest;
1169 }
1170 
group(int32_t groupNum,UErrorCode & status) const1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172     UnicodeString result;
1173     if (U_FAILURE(status)) {
1174         return result;
1175     }
1176     UText resultText = UTEXT_INITIALIZER;
1177     utext_openUnicodeString(&resultText, &result, &status);
1178     group(groupNum, &resultText, status);
1179     utext_close(&resultText);
1180     return result;
1181 }
1182 
1183 
1184 //  Return deep (mutable) clone
1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 //		using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188     UBool bailOut = FALSE;
1189     if (U_FAILURE(status)) {
1190         return dest;
1191     }
1192     if (U_FAILURE(fDeferredStatus)) {
1193         status = fDeferredStatus;
1194         bailOut = TRUE;
1195     }
1196 
1197     if (fMatch == FALSE) {
1198         status = U_REGEX_INVALID_STATE;
1199         bailOut = TRUE;
1200     }
1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
1203         bailOut = TRUE;
1204     }
1205 
1206     if (bailOut) {
1207         if (dest) {
1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209             return dest;
1210         } else {
1211             return utext_openUChars(NULL, NULL, 0, &status);
1212         }
1213     }
1214 
1215     int64_t s, e;
1216     if (groupNum == 0) {
1217         s = fMatchStart;
1218         e = fMatchEnd;
1219     } else {
1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
1222         U_ASSERT(groupOffset >= 0);
1223         s = fFrame->fExtra[groupOffset];
1224         e = fFrame->fExtra[groupOffset+1];
1225     }
1226 
1227     if (s < 0) {
1228         // A capture group wasn't part of the match
1229         if (dest) {
1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231             return dest;
1232         } else {
1233             return utext_openUChars(NULL, NULL, 0, &status);
1234         }
1235     }
1236     U_ASSERT(s <= e);
1237 
1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239         U_ASSERT(e <= fInputLength);
1240         if (dest) {
1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242         } else {
1243             UText groupText = UTEXT_INITIALIZER;
1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246             utext_close(&groupText);
1247         }
1248     } else {
1249         int32_t len16;
1250         if (UTEXT_USES_U16(fInputText)) {
1251             len16 = (int32_t)(e-s);
1252         } else {
1253             UErrorCode lengthStatus = U_ZERO_ERROR;
1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255         }
1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257         if (groupChars == NULL) {
1258             status = U_MEMORY_ALLOCATION_ERROR;
1259             return dest;
1260         }
1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262 
1263         if (dest) {
1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265         } else {
1266             UText groupText = UTEXT_INITIALIZER;
1267             utext_openUChars(&groupText, groupChars, len16, &status);
1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269             utext_close(&groupText);
1270         }
1271 
1272         uprv_free(groupChars);
1273     }
1274     return dest;
1275 }
1276 
1277 //--------------------------------------------------------------------------------
1278 //
1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
1280 //                   than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285     if (U_FAILURE(status)) {
1286         return 0;
1287     }
1288     if (U_FAILURE(fDeferredStatus)) {
1289         status = fDeferredStatus;
1290         return 0;
1291     }
1292     int64_t destLen = utext_nativeLength(dest);
1293 
1294     if (fMatch == FALSE) {
1295         status = U_REGEX_INVALID_STATE;
1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297     }
1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301     }
1302 
1303     int64_t s, e;
1304     if (groupNum == 0) {
1305         s = fMatchStart;
1306         e = fMatchEnd;
1307     } else {
1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
1310         U_ASSERT(groupOffset >= 0);
1311         s = fFrame->fExtra[groupOffset];
1312         e = fFrame->fExtra[groupOffset+1];
1313     }
1314 
1315     if (s < 0) {
1316         // A capture group wasn't part of the match
1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318     }
1319     U_ASSERT(s <= e);
1320 
1321     int64_t deltaLen;
1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323         U_ASSERT(e <= fInputLength);
1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325     } else {
1326         int32_t len16;
1327         if (UTEXT_USES_U16(fInputText)) {
1328             len16 = (int32_t)(e-s);
1329         } else {
1330             UErrorCode lengthStatus = U_ZERO_ERROR;
1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332         }
1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334         if (groupChars == NULL) {
1335             status = U_MEMORY_ALLOCATION_ERROR;
1336             return 0;
1337         }
1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339 
1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341         uprv_free(groupChars);
1342     }
1343     return deltaLen;
1344 }
1345 
1346 
1347 
1348 //--------------------------------------------------------------------------------
1349 //
1350 //  groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
groupCount() const1353 int32_t RegexMatcher::groupCount() const {
1354     return fPattern->fGroupMap->size();
1355 }
1356 
1357 
1358 
1359 //--------------------------------------------------------------------------------
1360 //
1361 //  hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365     return fAnchoringBounds;
1366 }
1367 
1368 
1369 //--------------------------------------------------------------------------------
1370 //
1371 //  hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
hasTransparentBounds() const1374 UBool RegexMatcher::hasTransparentBounds() const {
1375     return fTransparentBounds;
1376 }
1377 
1378 
1379 
1380 //--------------------------------------------------------------------------------
1381 //
1382 //  hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
hitEnd() const1385 UBool RegexMatcher::hitEnd() const {
1386     return fHitEnd;
1387 }
1388 
1389 
1390 //--------------------------------------------------------------------------------
1391 //
1392 //  input()
1393 //
1394 //--------------------------------------------------------------------------------
input() const1395 const UnicodeString &RegexMatcher::input() const {
1396     if (!fInput) {
1397         UErrorCode status = U_ZERO_ERROR;
1398         int32_t len16;
1399         if (UTEXT_USES_U16(fInputText)) {
1400             len16 = (int32_t)fInputLength;
1401         } else {
1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403             status = U_ZERO_ERROR; // overflow, length status
1404         }
1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
1406 
1407         UChar *inputChars = result->getBuffer(len16);
1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409         result->releaseBuffer(len16);
1410 
1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412     }
1413 
1414     return *fInput;
1415 }
1416 
1417 //--------------------------------------------------------------------------------
1418 //
1419 //  inputText()
1420 //
1421 //--------------------------------------------------------------------------------
inputText() const1422 UText *RegexMatcher::inputText() const {
1423     return fInputText;
1424 }
1425 
1426 
1427 //--------------------------------------------------------------------------------
1428 //
1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433     UBool bailOut = FALSE;
1434     if (U_FAILURE(status)) {
1435         return dest;
1436     }
1437     if (U_FAILURE(fDeferredStatus)) {
1438         status = fDeferredStatus;
1439         bailOut = TRUE;
1440     }
1441 
1442     if (bailOut) {
1443         if (dest) {
1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445             return dest;
1446         } else {
1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448         }
1449     }
1450 
1451     if (dest) {
1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454         } else {
1455             int32_t input16Len;
1456             if (UTEXT_USES_U16(fInputText)) {
1457                 input16Len = (int32_t)fInputLength;
1458             } else {
1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461             }
1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463             if (inputChars == NULL) {
1464                 return dest;
1465             }
1466 
1467             status = U_ZERO_ERROR;
1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469             status = U_ZERO_ERROR;
1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471 
1472             uprv_free(inputChars);
1473         }
1474         return dest;
1475     } else {
1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477     }
1478 }
1479 
1480 
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483     UBool retVal = FALSE;
1484 
1485     //  In the following test, we're really only interested in whether the UText should switch
1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487     //  will still point to the correct data.
1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489         UnicodeString *us=(UnicodeString *)ut->context;
1490 
1491         // Update to the latest length.
1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493         int32_t newLength = us->length();
1494 
1495         // Update the chunk description.
1496         // The buffer may have switched between stack- and heap-based.
1497         ut->chunkContents    = us->getBuffer();
1498         ut->chunkLength      = newLength;
1499         ut->chunkNativeLimit = newLength;
1500         ut->nativeIndexingLimit = newLength;
1501         retVal = TRUE;
1502     }
1503 
1504     return retVal;
1505 }
1506 
1507 //--------------------------------------------------------------------------------
1508 //
1509 //  lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513     if (U_FAILURE(status)) {
1514         return FALSE;
1515     }
1516     if (U_FAILURE(fDeferredStatus)) {
1517         status = fDeferredStatus;
1518         return FALSE;
1519     }
1520 
1521     if (fInputUniStrMaybeMutable) {
1522         if (compat_SyncMutableUTextContents(fInputText)) {
1523         fInputLength = utext_nativeLength(fInputText);
1524         reset();
1525         }
1526     }
1527     else {
1528         resetPreserveRegion();
1529     }
1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532     } else {
1533         MatchAt(fActiveStart, FALSE, status);
1534     }
1535     return fMatch;
1536 }
1537 
1538 
lookingAt(int64_t start,UErrorCode & status)1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540     if (U_FAILURE(status)) {
1541         return FALSE;
1542     }
1543     if (U_FAILURE(fDeferredStatus)) {
1544         status = fDeferredStatus;
1545         return FALSE;
1546     }
1547     reset();
1548 
1549     if (start < 0) {
1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
1551         return FALSE;
1552     }
1553 
1554     if (fInputUniStrMaybeMutable) {
1555         if (compat_SyncMutableUTextContents(fInputText)) {
1556         fInputLength = utext_nativeLength(fInputText);
1557         reset();
1558         }
1559     }
1560 
1561     int64_t nativeStart;
1562     nativeStart = start;
1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
1565         return FALSE;
1566     }
1567 
1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570     } else {
1571         MatchAt(nativeStart, FALSE, status);
1572     }
1573     return fMatch;
1574 }
1575 
1576 
1577 
1578 //--------------------------------------------------------------------------------
1579 //
1580 //  matches()
1581 //
1582 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584     if (U_FAILURE(status)) {
1585         return FALSE;
1586     }
1587     if (U_FAILURE(fDeferredStatus)) {
1588         status = fDeferredStatus;
1589         return FALSE;
1590     }
1591 
1592     if (fInputUniStrMaybeMutable) {
1593         if (compat_SyncMutableUTextContents(fInputText)) {
1594         fInputLength = utext_nativeLength(fInputText);
1595         reset();
1596         }
1597     }
1598     else {
1599         resetPreserveRegion();
1600     }
1601 
1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604     } else {
1605         MatchAt(fActiveStart, TRUE, status);
1606     }
1607     return fMatch;
1608 }
1609 
1610 
matches(int64_t start,UErrorCode & status)1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612     if (U_FAILURE(status)) {
1613         return FALSE;
1614     }
1615     if (U_FAILURE(fDeferredStatus)) {
1616         status = fDeferredStatus;
1617         return FALSE;
1618     }
1619     reset();
1620 
1621     if (start < 0) {
1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
1623         return FALSE;
1624     }
1625 
1626     if (fInputUniStrMaybeMutable) {
1627         if (compat_SyncMutableUTextContents(fInputText)) {
1628         fInputLength = utext_nativeLength(fInputText);
1629         reset();
1630         }
1631     }
1632 
1633     int64_t nativeStart;
1634     nativeStart = start;
1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
1637         return FALSE;
1638     }
1639 
1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642     } else {
1643         MatchAt(nativeStart, TRUE, status);
1644     }
1645     return fMatch;
1646 }
1647 
1648 
1649 
1650 //--------------------------------------------------------------------------------
1651 //
1652 //    pattern
1653 //
1654 //--------------------------------------------------------------------------------
pattern() const1655 const RegexPattern &RegexMatcher::pattern() const {
1656     return *fPattern;
1657 }
1658 
1659 
1660 
1661 //--------------------------------------------------------------------------------
1662 //
1663 //    region
1664 //
1665 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667     if (U_FAILURE(status)) {
1668         return *this;
1669     }
1670 
1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672         status = U_ILLEGAL_ARGUMENT_ERROR;
1673     }
1674 
1675     int64_t nativeStart = regionStart;
1676     int64_t nativeLimit = regionLimit;
1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678       status = U_ILLEGAL_ARGUMENT_ERROR;
1679     }
1680 
1681     if (startIndex == -1)
1682       this->reset();
1683     else
1684       resetPreserveRegion();
1685 
1686     fRegionStart = nativeStart;
1687     fRegionLimit = nativeLimit;
1688     fActiveStart = nativeStart;
1689     fActiveLimit = nativeLimit;
1690 
1691     if (startIndex != -1) {
1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
1694       }
1695       fMatchEnd = startIndex;
1696     }
1697 
1698     if (!fTransparentBounds) {
1699         fLookStart = nativeStart;
1700         fLookLimit = nativeLimit;
1701     }
1702     if (fAnchoringBounds) {
1703         fAnchorStart = nativeStart;
1704         fAnchorLimit = nativeLimit;
1705     }
1706     return *this;
1707 }
1708 
region(int64_t start,int64_t limit,UErrorCode & status)1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710   return region(start, limit, -1, status);
1711 }
1712 
1713 //--------------------------------------------------------------------------------
1714 //
1715 //    regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
regionEnd() const1718 int32_t RegexMatcher::regionEnd() const {
1719     return (int32_t)fRegionLimit;
1720 }
1721 
regionEnd64() const1722 int64_t RegexMatcher::regionEnd64() const {
1723     return fRegionLimit;
1724 }
1725 
1726 //--------------------------------------------------------------------------------
1727 //
1728 //    regionStart
1729 //
1730 //--------------------------------------------------------------------------------
regionStart() const1731 int32_t RegexMatcher::regionStart() const {
1732     return (int32_t)fRegionStart;
1733 }
1734 
regionStart64() const1735 int64_t RegexMatcher::regionStart64() const {
1736     return fRegionStart;
1737 }
1738 
1739 
1740 //--------------------------------------------------------------------------------
1741 //
1742 //    replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746     UText replacementText = UTEXT_INITIALIZER;
1747     UText resultText = UTEXT_INITIALIZER;
1748     UnicodeString resultString;
1749     if (U_FAILURE(status)) {
1750         return resultString;
1751     }
1752 
1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754     utext_openUnicodeString(&resultText, &resultString, &status);
1755 
1756     replaceAll(&replacementText, &resultText, status);
1757 
1758     utext_close(&resultText);
1759     utext_close(&replacementText);
1760 
1761     return resultString;
1762 }
1763 
1764 
1765 //
1766 //    replaceAll, UText mode
1767 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769     if (U_FAILURE(status)) {
1770         return dest;
1771     }
1772     if (U_FAILURE(fDeferredStatus)) {
1773         status = fDeferredStatus;
1774         return dest;
1775     }
1776 
1777     if (dest == NULL) {
1778         UnicodeString emptyString;
1779         UText empty = UTEXT_INITIALIZER;
1780 
1781         utext_openUnicodeString(&empty, &emptyString, &status);
1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783         utext_close(&empty);
1784     }
1785 
1786     if (U_SUCCESS(status)) {
1787         reset();
1788         while (find()) {
1789             appendReplacement(dest, replacement, status);
1790             if (U_FAILURE(status)) {
1791                 break;
1792             }
1793         }
1794         appendTail(dest, status);
1795     }
1796 
1797     return dest;
1798 }
1799 
1800 
1801 //--------------------------------------------------------------------------------
1802 //
1803 //    replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807     UText replacementText = UTEXT_INITIALIZER;
1808     UText resultText = UTEXT_INITIALIZER;
1809     UnicodeString resultString;
1810 
1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812     utext_openUnicodeString(&resultText, &resultString, &status);
1813 
1814     replaceFirst(&replacementText, &resultText, status);
1815 
1816     utext_close(&resultText);
1817     utext_close(&replacementText);
1818 
1819     return resultString;
1820 }
1821 
1822 //
1823 //    replaceFirst, UText mode
1824 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826     if (U_FAILURE(status)) {
1827         return dest;
1828     }
1829     if (U_FAILURE(fDeferredStatus)) {
1830         status = fDeferredStatus;
1831         return dest;
1832     }
1833 
1834     reset();
1835     if (!find()) {
1836         return getInput(dest, status);
1837     }
1838 
1839     if (dest == NULL) {
1840         UnicodeString emptyString;
1841         UText empty = UTEXT_INITIALIZER;
1842 
1843         utext_openUnicodeString(&empty, &emptyString, &status);
1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845         utext_close(&empty);
1846     }
1847 
1848     appendReplacement(dest, replacement, status);
1849     appendTail(dest, status);
1850 
1851     return dest;
1852 }
1853 
1854 
1855 //--------------------------------------------------------------------------------
1856 //
1857 //     requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
requireEnd() const1860 UBool RegexMatcher::requireEnd() const {
1861     return fRequireEnd;
1862 }
1863 
1864 
1865 //--------------------------------------------------------------------------------
1866 //
1867 //     reset
1868 //
1869 //--------------------------------------------------------------------------------
reset()1870 RegexMatcher &RegexMatcher::reset() {
1871     fRegionStart    = 0;
1872     fRegionLimit    = fInputLength;
1873     fActiveStart    = 0;
1874     fActiveLimit    = fInputLength;
1875     fAnchorStart    = 0;
1876     fAnchorLimit    = fInputLength;
1877     fLookStart      = 0;
1878     fLookLimit      = fInputLength;
1879     resetPreserveRegion();
1880     return *this;
1881 }
1882 
1883 
1884 
resetPreserveRegion()1885 void RegexMatcher::resetPreserveRegion() {
1886     fMatchStart     = 0;
1887     fMatchEnd       = 0;
1888     fLastMatchEnd   = -1;
1889     fAppendPosition = 0;
1890     fMatch          = FALSE;
1891     fHitEnd         = FALSE;
1892     fRequireEnd     = FALSE;
1893     fTime           = 0;
1894     fTickCounter    = TIMER_INITIAL_VALUE;
1895     //resetStack(); // more expensive than it looks...
1896 }
1897 
1898 
reset(const UnicodeString & input)1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901     if (fPattern->fNeedsAltInput) {
1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903     }
1904     fInputLength = utext_nativeLength(fInputText);
1905 
1906     reset();
1907     delete fInput;
1908     fInput = NULL;
1909 
1910     //  Do the following for any UnicodeString.
1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912     fInputUniStrMaybeMutable = TRUE;
1913 
1914     if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916         UErrorCode status = U_ZERO_ERROR;
1917         fWordBreakItr->setText(fInputText, status);
1918 #endif
1919     }
1920     return *this;
1921 }
1922 
1923 
reset(UText * input)1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925     if (fInputText != input) {
1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928         fInputLength = utext_nativeLength(fInputText);
1929 
1930         delete fInput;
1931         fInput = NULL;
1932 
1933         if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935             UErrorCode status = U_ZERO_ERROR;
1936             fWordBreakItr->setText(input, status);
1937 #endif
1938         }
1939     }
1940     reset();
1941     fInputUniStrMaybeMutable = FALSE;
1942 
1943     return *this;
1944 }
1945 
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948     return *this;
1949 }*/
1950 
reset(int64_t position,UErrorCode & status)1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952     if (U_FAILURE(status)) {
1953         return *this;
1954     }
1955     reset();       // Reset also resets the region to be the entire string.
1956 
1957     if (position < 0 || position > fActiveLimit) {
1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
1959         return *this;
1960     }
1961     fMatchEnd = position;
1962     return *this;
1963 }
1964 
1965 
1966 
1967 
1968 
1969 //--------------------------------------------------------------------------------
1970 //
1971 //    setTrace
1972 //
1973 //--------------------------------------------------------------------------------
setTrace(UBool state)1974 void RegexMatcher::setTrace(UBool state) {
1975     fTraceDebug = state;
1976 }
1977 
1978 
1979 
1980 //---------------------------------------------------------------------
1981 //
1982 //   split
1983 //
1984 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)1985 int32_t  RegexMatcher::split(const UnicodeString &input,
1986         UnicodeString    dest[],
1987         int32_t          destCapacity,
1988         UErrorCode      &status)
1989 {
1990     UText inputText = UTEXT_INITIALIZER;
1991     utext_openConstUnicodeString(&inputText, &input, &status);
1992     if (U_FAILURE(status)) {
1993         return 0;
1994     }
1995 
1996     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
1997     if (destText == NULL) {
1998         status = U_MEMORY_ALLOCATION_ERROR;
1999         return 0;
2000     }
2001     int32_t i;
2002     for (i = 0; i < destCapacity; i++) {
2003         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2004     }
2005 
2006     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2007 
2008     for (i = 0; i < destCapacity; i++) {
2009         utext_close(destText[i]);
2010     }
2011 
2012     uprv_free(destText);
2013     utext_close(&inputText);
2014     return fieldCount;
2015 }
2016 
2017 //
2018 //   split, UText mode
2019 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2020 int32_t  RegexMatcher::split(UText *input,
2021         UText           *dest[],
2022         int32_t          destCapacity,
2023         UErrorCode      &status)
2024 {
2025     //
2026     // Check arguements for validity
2027     //
2028     if (U_FAILURE(status)) {
2029         return 0;
2030     };
2031 
2032     if (destCapacity < 1) {
2033         status = U_ILLEGAL_ARGUMENT_ERROR;
2034         return 0;
2035     }
2036 
2037     //
2038     // Reset for the input text
2039     //
2040     reset(input);
2041     int64_t   nextOutputStringStart = 0;
2042     if (fActiveLimit == 0) {
2043         return 0;
2044     }
2045 
2046     //
2047     // Loop through the input text, searching for the delimiter pattern
2048     //
2049     int32_t i;
2050     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2051     for (i=0; ; i++) {
2052         if (i>=destCapacity-1) {
2053             // There is one or zero output string left.
2054             // Fill the last output string with whatever is left from the input, then exit the loop.
2055             //  ( i will be == destCapacity if we filled the output array while processing
2056             //    capture groups of the delimiter expression, in which case we will discard the
2057             //    last capture group saved in favor of the unprocessed remainder of the
2058             //    input string.)
2059             i = destCapacity-1;
2060             if (fActiveLimit > nextOutputStringStart) {
2061                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2062                     if (dest[i]) {
2063                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2064                                       input->chunkContents+nextOutputStringStart,
2065                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2066                     } else {
2067                         UText remainingText = UTEXT_INITIALIZER;
2068                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2069                                          fActiveLimit-nextOutputStringStart, &status);
2070                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2071                         utext_close(&remainingText);
2072                     }
2073                 } else {
2074                     UErrorCode lengthStatus = U_ZERO_ERROR;
2075                     int32_t remaining16Length =
2076                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2077                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2078                     if (remainingChars == NULL) {
2079                         status = U_MEMORY_ALLOCATION_ERROR;
2080                         break;
2081                     }
2082 
2083                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2084                     if (dest[i]) {
2085                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2086                     } else {
2087                         UText remainingText = UTEXT_INITIALIZER;
2088                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2089                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2090                         utext_close(&remainingText);
2091                     }
2092 
2093                     uprv_free(remainingChars);
2094                 }
2095             }
2096             break;
2097         }
2098         if (find()) {
2099             // We found another delimiter.  Move everything from where we started looking
2100             //  up until the start of the delimiter into the next output string.
2101             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2102                 if (dest[i]) {
2103                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2104                                   input->chunkContents+nextOutputStringStart,
2105                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2106                 } else {
2107                     UText remainingText = UTEXT_INITIALIZER;
2108                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2109                                       fMatchStart-nextOutputStringStart, &status);
2110                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2111                     utext_close(&remainingText);
2112                 }
2113             } else {
2114                 UErrorCode lengthStatus = U_ZERO_ERROR;
2115                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2116                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2117                 if (remainingChars == NULL) {
2118                     status = U_MEMORY_ALLOCATION_ERROR;
2119                     break;
2120                 }
2121                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2122                 if (dest[i]) {
2123                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2124                 } else {
2125                     UText remainingText = UTEXT_INITIALIZER;
2126                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2127                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2128                     utext_close(&remainingText);
2129                 }
2130 
2131                 uprv_free(remainingChars);
2132             }
2133             nextOutputStringStart = fMatchEnd;
2134 
2135             // If the delimiter pattern has capturing parentheses, the captured
2136             //  text goes out into the next n destination strings.
2137             int32_t groupNum;
2138             UBool lastGroupWasNullUText = FALSE;
2139             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2140                 if (i==destCapacity-1) {
2141                     break;
2142                 }
2143                 i++;
2144                 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2145                 dest[i] = group(groupNum, dest[i], status);
2146             }
2147 
2148             if (nextOutputStringStart == fActiveLimit) {
2149                 // The delimiter was at the end of the string.  We're done.
2150                 break;
2151             } else if (i == destCapacity-1) {
2152                 // We're out of capture groups, and the rest of the string is more important
2153                 if (lastGroupWasNullUText) {
2154                     utext_close(dest[i]);
2155                     dest[i] = NULL;
2156                 }
2157             }
2158 
2159         }
2160         else
2161         {
2162             // We ran off the end of the input while looking for the next delimiter.
2163             // All the remaining text goes into the current output string.
2164             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2165                 if (dest[i]) {
2166                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2167                                   input->chunkContents+nextOutputStringStart,
2168                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2169                 } else {
2170                     UText remainingText = UTEXT_INITIALIZER;
2171                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2172                                      fActiveLimit-nextOutputStringStart, &status);
2173                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2174                     utext_close(&remainingText);
2175                 }
2176             } else {
2177                 UErrorCode lengthStatus = U_ZERO_ERROR;
2178                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2179                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2180                 if (remainingChars == NULL) {
2181                     status = U_MEMORY_ALLOCATION_ERROR;
2182                     break;
2183                 }
2184 
2185                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2186                 if (dest[i]) {
2187                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2188                 } else {
2189                     UText remainingText = UTEXT_INITIALIZER;
2190                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2191                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2192                     utext_close(&remainingText);
2193                 }
2194 
2195                 uprv_free(remainingChars);
2196             }
2197             break;
2198         }
2199         if (U_FAILURE(status)) {
2200             break;
2201         }
2202     }   // end of for loop
2203     return i+1;
2204 }
2205 
2206 
2207 //--------------------------------------------------------------------------------
2208 //
2209 //     start
2210 //
2211 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2212 int32_t RegexMatcher::start(UErrorCode &status) const {
2213     return start(0, status);
2214 }
2215 
start64(UErrorCode & status) const2216 int64_t RegexMatcher::start64(UErrorCode &status) const {
2217     return start64(0, status);
2218 }
2219 
2220 //--------------------------------------------------------------------------------
2221 //
2222 //     start(int32_t group, UErrorCode &status)
2223 //
2224 //--------------------------------------------------------------------------------
2225 
start64(int32_t group,UErrorCode & status) const2226 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2227     if (U_FAILURE(status)) {
2228         return -1;
2229     }
2230     if (U_FAILURE(fDeferredStatus)) {
2231         status = fDeferredStatus;
2232         return -1;
2233     }
2234     if (fMatch == FALSE) {
2235         status = U_REGEX_INVALID_STATE;
2236         return -1;
2237     }
2238     if (group < 0 || group > fPattern->fGroupMap->size()) {
2239         status = U_INDEX_OUTOFBOUNDS_ERROR;
2240         return -1;
2241     }
2242     int64_t s;
2243     if (group == 0) {
2244         s = fMatchStart;
2245     } else {
2246         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2247         U_ASSERT(groupOffset < fPattern->fFrameSize);
2248         U_ASSERT(groupOffset >= 0);
2249         s = fFrame->fExtra[groupOffset];
2250     }
2251 
2252     return s;
2253 }
2254 
2255 
start(int32_t group,UErrorCode & status) const2256 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2257     return (int32_t)start64(group, status);
2258 }
2259 
2260 //--------------------------------------------------------------------------------
2261 //
2262 //     useAnchoringBounds
2263 //
2264 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2265 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2266     fAnchoringBounds = b;
2267     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2268     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2269     return *this;
2270 }
2271 
2272 
2273 //--------------------------------------------------------------------------------
2274 //
2275 //     useTransparentBounds
2276 //
2277 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2278 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2279     fTransparentBounds = b;
2280     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2281     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2282     return *this;
2283 }
2284 
2285 //--------------------------------------------------------------------------------
2286 //
2287 //     setTimeLimit
2288 //
2289 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2290 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2291     if (U_FAILURE(status)) {
2292         return;
2293     }
2294     if (U_FAILURE(fDeferredStatus)) {
2295         status = fDeferredStatus;
2296         return;
2297     }
2298     if (limit < 0) {
2299         status = U_ILLEGAL_ARGUMENT_ERROR;
2300         return;
2301     }
2302     fTimeLimit = limit;
2303 }
2304 
2305 
2306 //--------------------------------------------------------------------------------
2307 //
2308 //     getTimeLimit
2309 //
2310 //--------------------------------------------------------------------------------
getTimeLimit() const2311 int32_t RegexMatcher::getTimeLimit() const {
2312     return fTimeLimit;
2313 }
2314 
2315 
2316 //--------------------------------------------------------------------------------
2317 //
2318 //     setStackLimit
2319 //
2320 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2321 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2322     if (U_FAILURE(status)) {
2323         return;
2324     }
2325     if (U_FAILURE(fDeferredStatus)) {
2326         status = fDeferredStatus;
2327         return;
2328     }
2329     if (limit < 0) {
2330         status = U_ILLEGAL_ARGUMENT_ERROR;
2331         return;
2332     }
2333 
2334     // Reset the matcher.  This is needed here in case there is a current match
2335     //    whose final stack frame (containing the match results, pointed to by fFrame)
2336     //    would be lost by resizing to a smaller stack size.
2337     reset();
2338 
2339     if (limit == 0) {
2340         // Unlimited stack expansion
2341         fStack->setMaxCapacity(0);
2342     } else {
2343         // Change the units of the limit  from bytes to ints, and bump the size up
2344         //   to be big enough to hold at least one stack frame for the pattern,
2345         //   if it isn't there already.
2346         int32_t adjustedLimit = limit / sizeof(int32_t);
2347         if (adjustedLimit < fPattern->fFrameSize) {
2348             adjustedLimit = fPattern->fFrameSize;
2349         }
2350         fStack->setMaxCapacity(adjustedLimit);
2351     }
2352     fStackLimit = limit;
2353 }
2354 
2355 
2356 //--------------------------------------------------------------------------------
2357 //
2358 //     getStackLimit
2359 //
2360 //--------------------------------------------------------------------------------
getStackLimit() const2361 int32_t RegexMatcher::getStackLimit() const {
2362     return fStackLimit;
2363 }
2364 
2365 
2366 //--------------------------------------------------------------------------------
2367 //
2368 //     setMatchCallback
2369 //
2370 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2371 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2372                                     const void              *context,
2373                                     UErrorCode              &status) {
2374     if (U_FAILURE(status)) {
2375         return;
2376     }
2377     fCallbackFn = callback;
2378     fCallbackContext = context;
2379 }
2380 
2381 
2382 //--------------------------------------------------------------------------------
2383 //
2384 //     getMatchCallback
2385 //
2386 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2387 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2388                                   const void              *&context,
2389                                   UErrorCode              &status) {
2390     if (U_FAILURE(status)) {
2391        return;
2392     }
2393     callback = fCallbackFn;
2394     context  = fCallbackContext;
2395 }
2396 
2397 
2398 //--------------------------------------------------------------------------------
2399 //
2400 //     setMatchCallback
2401 //
2402 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2403 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2404                                                 const void                      *context,
2405                                                 UErrorCode                      &status) {
2406     if (U_FAILURE(status)) {
2407         return;
2408     }
2409     fFindProgressCallbackFn = callback;
2410     fFindProgressCallbackContext = context;
2411 }
2412 
2413 
2414 //--------------------------------------------------------------------------------
2415 //
2416 //     getMatchCallback
2417 //
2418 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2419 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2420                                                 const void                    *&context,
2421                                                 UErrorCode                    &status) {
2422     if (U_FAILURE(status)) {
2423        return;
2424     }
2425     callback = fFindProgressCallbackFn;
2426     context  = fFindProgressCallbackContext;
2427 }
2428 
2429 
2430 //================================================================================
2431 //
2432 //    Code following this point in this file is the internal
2433 //    Match Engine Implementation.
2434 //
2435 //================================================================================
2436 
2437 
2438 //--------------------------------------------------------------------------------
2439 //
2440 //   resetStack
2441 //           Discard any previous contents of the state save stack, and initialize a
2442 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2443 //           where they indicate that a group has not yet matched anything.
2444 //--------------------------------------------------------------------------------
resetStack()2445 REStackFrame *RegexMatcher::resetStack() {
2446     // Discard any previous contents of the state save stack, and initialize a
2447     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2448     //  where they indicate that a group has not yet matched anything.
2449     fStack->removeAllElements();
2450 
2451     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2452     int32_t i;
2453     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2454         iFrame->fExtra[i] = -1;
2455     }
2456     return iFrame;
2457 }
2458 
2459 
2460 
2461 //--------------------------------------------------------------------------------
2462 //
2463 //   isWordBoundary
2464 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2465 //                     For us,
2466 //                       If the current char is a combining mark,
2467 //                          \b is FALSE.
2468 //                       Else Scan backwards to the first non-combining char.
2469 //                            We are at a boundary if the this char and the original chars are
2470 //                               opposite in membership in \w set
2471 //
2472 //          parameters:   pos   - the current position in the input buffer
2473 //
2474 //              TODO:  double-check edge cases at region boundaries.
2475 //
2476 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2477 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2478     UBool isBoundary = FALSE;
2479     UBool cIsWord    = FALSE;
2480 
2481     if (pos >= fLookLimit) {
2482         fHitEnd = TRUE;
2483     } else {
2484         // Determine whether char c at current position is a member of the word set of chars.
2485         // If we're off the end of the string, behave as though we're not at a word char.
2486         UTEXT_SETNATIVEINDEX(fInputText, pos);
2487         UChar32  c = UTEXT_CURRENT32(fInputText);
2488         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2489             // Current char is a combining one.  Not a boundary.
2490             return FALSE;
2491         }
2492         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2493     }
2494 
2495     // Back up until we come to a non-combining char, determine whether
2496     //  that char is a word char.
2497     UBool prevCIsWord = FALSE;
2498     for (;;) {
2499         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2500             break;
2501         }
2502         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2503         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2504               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2505             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2506             break;
2507         }
2508     }
2509     isBoundary = cIsWord ^ prevCIsWord;
2510     return isBoundary;
2511 }
2512 
isChunkWordBoundary(int32_t pos)2513 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2514     UBool isBoundary = FALSE;
2515     UBool cIsWord    = FALSE;
2516 
2517     const UChar *inputBuf = fInputText->chunkContents;
2518 
2519     if (pos >= fLookLimit) {
2520         fHitEnd = TRUE;
2521     } else {
2522         // Determine whether char c at current position is a member of the word set of chars.
2523         // If we're off the end of the string, behave as though we're not at a word char.
2524         UChar32 c;
2525         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2526         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2527             // Current char is a combining one.  Not a boundary.
2528             return FALSE;
2529         }
2530         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2531     }
2532 
2533     // Back up until we come to a non-combining char, determine whether
2534     //  that char is a word char.
2535     UBool prevCIsWord = FALSE;
2536     for (;;) {
2537         if (pos <= fLookStart) {
2538             break;
2539         }
2540         UChar32 prevChar;
2541         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2542         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2543               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2544             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2545             break;
2546         }
2547     }
2548     isBoundary = cIsWord ^ prevCIsWord;
2549     return isBoundary;
2550 }
2551 
2552 //--------------------------------------------------------------------------------
2553 //
2554 //   isUWordBoundary
2555 //
2556 //         Test for a word boundary using RBBI word break.
2557 //
2558 //          parameters:   pos   - the current position in the input buffer
2559 //
2560 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2561 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2562     UBool       returnVal = FALSE;
2563 #if UCONFIG_NO_BREAK_ITERATION==0
2564 
2565     // If we haven't yet created a break iterator for this matcher, do it now.
2566     if (fWordBreakItr == NULL) {
2567         fWordBreakItr =
2568             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2569         if (U_FAILURE(fDeferredStatus)) {
2570             return FALSE;
2571         }
2572         fWordBreakItr->setText(fInputText, fDeferredStatus);
2573     }
2574 
2575     if (pos >= fLookLimit) {
2576         fHitEnd = TRUE;
2577         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2578                             //    words are not boundaries.  All non-word chars stand by themselves,
2579                             //    with word boundaries on both sides.
2580     } else {
2581         if (!UTEXT_USES_U16(fInputText)) {
2582             // !!!: Would like a better way to do this!
2583             UErrorCode status = U_ZERO_ERROR;
2584             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2585         }
2586         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2587     }
2588 #endif
2589     return   returnVal;
2590 }
2591 
2592 //--------------------------------------------------------------------------------
2593 //
2594 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2595 //                     saves. Increment the "time" counter, and call the
2596 //                     user callback function if there is one installed.
2597 //
2598 //                     If the match operation needs to be aborted, either for a time-out
2599 //                     or because the user callback asked for it, just set an error status.
2600 //                     The engine will pick that up and stop in its outer loop.
2601 //
2602 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2603 void RegexMatcher::IncrementTime(UErrorCode &status) {
2604     fTickCounter = TIMER_INITIAL_VALUE;
2605     fTime++;
2606     if (fCallbackFn != NULL) {
2607         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2608             status = U_REGEX_STOPPED_BY_CALLER;
2609             return;
2610         }
2611     }
2612     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2613         status = U_REGEX_TIME_OUT;
2614     }
2615 }
2616 
2617 //--------------------------------------------------------------------------------
2618 //
2619 //   ReportFindProgress     This function is called once for each advance in the target
2620 //                          string from the find() function, and calls the user progress callback
2621 //                          function if there is one installed.
2622 //
2623 //                          NOTE:
2624 //
2625 //                          If the match operation needs to be aborted because the user
2626 //                          callback asked for it, just set an error status.
2627 //                          The engine will pick that up and stop in its outer loop.
2628 //
2629 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2630 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2631     if (fFindProgressCallbackFn != NULL) {
2632         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2633             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2634             return FALSE;
2635         }
2636     }
2637     return TRUE;
2638 }
2639 
2640 //--------------------------------------------------------------------------------
2641 //
2642 //   StateSave
2643 //       Make a new stack frame, initialized as a copy of the current stack frame.
2644 //       Set the pattern index in the original stack frame from the operand value
2645 //       in the opcode.  Execution of the engine continues with the state in
2646 //       the newly created stack frame
2647 //
2648 //       Note that reserveBlock() may grow the stack, resulting in the
2649 //       whole thing being relocated in memory.
2650 //
2651 //    Parameters:
2652 //       fp           The top frame pointer when called.  At return, a new
2653 //                    fame will be present
2654 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2655 //                    (not new) frame.  If execution ever back-tracks out of the
2656 //                    new frame, this will be where we continue from in the pattern.
2657 //    Return
2658 //                    The new frame pointer.
2659 //
2660 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2661 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2662     // push storage for a new frame.
2663     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2664     if (newFP == NULL) {
2665         // Failure on attempted stack expansion.
2666         //   Stack function set some other error code, change it to a more
2667         //   specific one for regular expressions.
2668         status = U_REGEX_STACK_OVERFLOW;
2669         // We need to return a writable stack frame, so just return the
2670         //    previous frame.  The match operation will stop quickly
2671         //    because of the error status, after which the frame will never
2672         //    be looked at again.
2673         return fp;
2674     }
2675     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2676 
2677     // New stack frame = copy of old top frame.
2678     int64_t *source = (int64_t *)fp;
2679     int64_t *dest   = newFP;
2680     for (;;) {
2681         *dest++ = *source++;
2682         if (source == newFP) {
2683             break;
2684         }
2685     }
2686 
2687     fTickCounter--;
2688     if (fTickCounter <= 0) {
2689        IncrementTime(status);    // Re-initializes fTickCounter
2690     }
2691     fp->fPatIdx = savePatIdx;
2692     return (REStackFrame *)newFP;
2693 }
2694 
2695 
2696 //--------------------------------------------------------------------------------
2697 //
2698 //   MatchAt      This is the actual matching engine.
2699 //
2700 //                  startIdx:    begin matching a this index.
2701 //                  toEnd:       if true, match must extend to end of the input region
2702 //
2703 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2704 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2705     UBool       isMatch  = FALSE;      // True if the we have a match.
2706 
2707     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2708 
2709     int32_t     op;                    // Operation from the compiled pattern, split into
2710     int32_t     opType;                //    the opcode
2711     int32_t     opValue;               //    and the operand value.
2712 
2713     #ifdef REGEX_RUN_DEBUG
2714     if (fTraceDebug)
2715     {
2716         printf("MatchAt(startIdx=%ld)\n", startIdx);
2717         printf("Original Pattern: ");
2718         UChar32 c = utext_next32From(fPattern->fPattern, 0);
2719         while (c != U_SENTINEL) {
2720             if (c<32 || c>256) {
2721                 c = '.';
2722             }
2723             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2724 
2725             c = UTEXT_NEXT32(fPattern->fPattern);
2726         }
2727         printf("\n");
2728         printf("Input String: ");
2729         c = utext_next32From(fInputText, 0);
2730         while (c != U_SENTINEL) {
2731             if (c<32 || c>256) {
2732                 c = '.';
2733             }
2734             printf("%c", c);
2735 
2736             c = UTEXT_NEXT32(fInputText);
2737         }
2738         printf("\n");
2739         printf("\n");
2740     }
2741     #endif
2742 
2743     if (U_FAILURE(status)) {
2744         return;
2745     }
2746 
2747     //  Cache frequently referenced items from the compiled pattern
2748     //
2749     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2750 
2751     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2752     UVector             *sets          = fPattern->fSets;
2753 
2754     fFrameSize = fPattern->fFrameSize;
2755     REStackFrame        *fp            = resetStack();
2756 
2757     fp->fPatIdx   = 0;
2758     fp->fInputIdx = startIdx;
2759 
2760     // Zero out the pattern's static data
2761     int32_t i;
2762     for (i = 0; i<fPattern->fDataSize; i++) {
2763         fData[i] = 0;
2764     }
2765 
2766     //
2767     //  Main loop for interpreting the compiled pattern.
2768     //  One iteration of the loop per pattern operation performed.
2769     //
2770     for (;;) {
2771 #if 0
2772         if (_heapchk() != _HEAPOK) {
2773             fprintf(stderr, "Heap Trouble\n");
2774         }
2775 #endif
2776 
2777         op      = (int32_t)pat[fp->fPatIdx];
2778         opType  = URX_TYPE(op);
2779         opValue = URX_VAL(op);
2780         #ifdef REGEX_RUN_DEBUG
2781         if (fTraceDebug) {
2782             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2783             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2784                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2785             fPattern->dumpOp(fp->fPatIdx);
2786         }
2787         #endif
2788         fp->fPatIdx++;
2789 
2790         switch (opType) {
2791 
2792 
2793         case URX_NOP:
2794             break;
2795 
2796 
2797         case URX_BACKTRACK:
2798             // Force a backtrack.  In some circumstances, the pattern compiler
2799             //   will notice that the pattern can't possibly match anything, and will
2800             //   emit one of these at that point.
2801             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2802             break;
2803 
2804 
2805         case URX_ONECHAR:
2806             if (fp->fInputIdx < fActiveLimit) {
2807                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2808                 UChar32 c = UTEXT_NEXT32(fInputText);
2809                 if (c == opValue) {
2810                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2811                     break;
2812                 }
2813             } else {
2814                 fHitEnd = TRUE;
2815             }
2816 
2817             #ifdef REGEX_SMART_BACKTRACKING
2818             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2819                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2820                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2821                     UBool success = FALSE;
2822                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
2823                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2824                         if (c == opValue) {
2825                             success = TRUE;
2826                             break;
2827                         } else if (c == U_SENTINEL) {
2828                             break;
2829                         }
2830                         c = UTEXT_PREVIOUS32(fInputText);
2831                     }
2832                     if (success) {
2833                         fHitEnd = FALSE;
2834                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2835                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2836                         if (fp->fInputIdx > backSearchIndex) {
2837                             fp = StateSave(fp, fp->fPatIdx, status);
2838                         }
2839                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
2840                         break;
2841                     }
2842                 }
2843             }
2844             #endif
2845 
2846             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2847             break;
2848 
2849 
2850         case URX_STRING:
2851             {
2852                 // Test input against a literal string.
2853                 // Strings require two slots in the compiled pattern, one for the
2854                 //   offset to the string text, and one for the length.
2855                 int32_t   stringStartIdx = opValue;
2856                 int32_t   stringLen;
2857 
2858                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2859                 fp->fPatIdx++;
2860                 opType    = URX_TYPE(op);
2861                 stringLen = URX_VAL(op);
2862                 U_ASSERT(opType == URX_STRING_LEN);
2863                 U_ASSERT(stringLen >= 2);
2864 
2865                 const UChar *patternChars = litText+stringStartIdx;
2866                 const UChar *patternEnd = patternChars+stringLen;
2867 
2868                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869                 UChar32 c;
2870                 UBool success = TRUE;
2871 
2872                 while (patternChars < patternEnd && success) {
2873                     c = UTEXT_NEXT32(fInputText);
2874 
2875                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2876                         if (U_IS_BMP(c)) {
2877                             success = (*patternChars == c);
2878                             patternChars += 1;
2879                         } else if (patternChars+1 < patternEnd) {
2880                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2881                             patternChars += 2;
2882                         }
2883                     } else {
2884                         success = FALSE;
2885                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
2886                     }
2887                 }
2888 
2889                 if (success) {
2890                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2891                 } else {
2892                     #ifdef REGEX_SMART_BACKTRACKING
2893                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2894                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2895                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2896                             // Reset to last start point
2897                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898                             patternChars = litText+stringStartIdx;
2899 
2900                             // Search backwards for a possible start
2901                             do {
2902                                 c = UTEXT_PREVIOUS32(fInputText);
2903                                 if (c == U_SENTINEL) {
2904                                     break;
2905                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2906                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2907                                     success = TRUE;
2908                                     break;
2909                                 }
2910                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2911 
2912                             // And try again
2913                             if (success) {
2914                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2915                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2916                                 if (fp->fInputIdx > backSearchIndex) {
2917                                     fp = StateSave(fp, fp->fPatIdx, status);
2918                                 }
2919                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2920                                 break;
2921                             }
2922                         }
2923                     }
2924                     #endif
2925                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2926                 }
2927             }
2928             break;
2929 
2930 
2931         case URX_STATE_SAVE:
2932             fp = StateSave(fp, opValue, status);
2933             break;
2934 
2935 
2936         case URX_END:
2937             // The match loop will exit via this path on a successful match,
2938             //   when we reach the end of the pattern.
2939             if (toEnd && fp->fInputIdx != fActiveLimit) {
2940                 // The pattern matched, but not to the end of input.  Try some more.
2941                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2942                 break;
2943             }
2944             isMatch = TRUE;
2945             goto  breakFromLoop;
2946 
2947         // Start and End Capture stack frame variables are laid out out like this:
2948             //  fp->fExtra[opValue]  - The start of a completed capture group
2949             //             opValue+1 - The end   of a completed capture group
2950             //             opValue+2 - the start of a capture group whose end
2951             //                          has not yet been reached (and might not ever be).
2952         case URX_START_CAPTURE:
2953             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2954             fp->fExtra[opValue+2] = fp->fInputIdx;
2955             break;
2956 
2957 
2958         case URX_END_CAPTURE:
2959             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2960             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2961             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2962             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2963             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2964             break;
2965 
2966 
2967         case URX_DOLLAR:                   //  $, test for End of line
2968                                            //     or for position before new line at end of input
2969             {
2970                 if (fp->fInputIdx >= fAnchorLimit) {
2971                     // We really are at the end of input.  Success.
2972                     fHitEnd = TRUE;
2973                     fRequireEnd = TRUE;
2974                     break;
2975                 }
2976 
2977                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2978 
2979                 // If we are positioned just before a new-line that is located at the
2980                 //   end of input, succeed.
2981                 UChar32 c = UTEXT_NEXT32(fInputText);
2982                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2983                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2984                         // If not in the middle of a CR/LF sequence
2985                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2986                             // At new-line at end of input. Success
2987                             fHitEnd = TRUE;
2988                             fRequireEnd = TRUE;
2989 
2990                             break;
2991                         }
2992                     }
2993                 } else {
2994                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2995                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2996                         fHitEnd = TRUE;
2997                         fRequireEnd = TRUE;
2998                         break;                         // At CR/LF at end of input.  Success
2999                     }
3000                 }
3001 
3002                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3003             }
3004             break;
3005 
3006 
3007          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3008             if (fp->fInputIdx >= fAnchorLimit) {
3009                 // Off the end of input.  Success.
3010                 fHitEnd = TRUE;
3011                 fRequireEnd = TRUE;
3012                 break;
3013             } else {
3014                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015                 UChar32 c = UTEXT_NEXT32(fInputText);
3016                 // Either at the last character of input, or off the end.
3017                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3018                     fHitEnd = TRUE;
3019                     fRequireEnd = TRUE;
3020                     break;
3021                 }
3022             }
3023 
3024             // Not at end of input.  Back-track out.
3025             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3026             break;
3027 
3028 
3029          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3030              {
3031                  if (fp->fInputIdx >= fAnchorLimit) {
3032                      // We really are at the end of input.  Success.
3033                      fHitEnd = TRUE;
3034                      fRequireEnd = TRUE;
3035                      break;
3036                  }
3037                  // If we are positioned just before a new-line, succeed.
3038                  // It makes no difference where the new-line is within the input.
3039                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3040                  UChar32 c = UTEXT_CURRENT32(fInputText);
3041                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3042                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3043                      //  In multi-line mode, hitting a new-line just before the end of input does not
3044                      //   set the hitEnd or requireEnd flags
3045                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3046                         break;
3047                      }
3048                  }
3049                  // not at a new line.  Fail.
3050                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051              }
3052              break;
3053 
3054 
3055          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3056              {
3057                  if (fp->fInputIdx >= fAnchorLimit) {
3058                      // We really are at the end of input.  Success.
3059                      fHitEnd = TRUE;
3060                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3061                      break;               //   adding a new-line would not lose the match.
3062                  }
3063                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3064                  // It makes no difference where the new-line is within the input.
3065                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3067                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068                  }
3069              }
3070              break;
3071 
3072 
3073        case URX_CARET:                    //  ^, test for start of line
3074             if (fp->fInputIdx != fAnchorStart) {
3075                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3076             }
3077             break;
3078 
3079 
3080        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3081            {
3082                if (fp->fInputIdx == fAnchorStart) {
3083                    // We are at the start input.  Success.
3084                    break;
3085                }
3086                // Check whether character just before the current pos is a new-line
3087                //   unless we are at the end of input
3088                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3089                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3090                if ((fp->fInputIdx < fAnchorLimit) &&
3091                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3092                    //  It's a new-line.  ^ is true.  Success.
3093                    //  TODO:  what should be done with positions between a CR and LF?
3094                    break;
3095                }
3096                // Not at the start of a line.  Fail.
3097                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098            }
3099            break;
3100 
3101 
3102        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3103            {
3104                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3105                if (fp->fInputIdx <= fAnchorStart) {
3106                    // We are at the start input.  Success.
3107                    break;
3108                }
3109                // Check whether character just before the current pos is a new-line
3110                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3111                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3112                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3113                if (c != 0x0a) {
3114                    // Not at the start of a line.  Back-track out.
3115                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116                }
3117            }
3118            break;
3119 
3120         case URX_BACKSLASH_B:          // Test for word boundaries
3121             {
3122                 UBool success = isWordBoundary(fp->fInputIdx);
3123                 success ^= (opValue != 0);     // flip sense for \B
3124                 if (!success) {
3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126                 }
3127             }
3128             break;
3129 
3130 
3131         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3132             {
3133                 UBool success = isUWordBoundary(fp->fInputIdx);
3134                 success ^= (opValue != 0);     // flip sense for \B
3135                 if (!success) {
3136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137                 }
3138             }
3139             break;
3140 
3141 
3142         case URX_BACKSLASH_D:            // Test for decimal digit
3143             {
3144                 if (fp->fInputIdx >= fActiveLimit) {
3145                     fHitEnd = TRUE;
3146                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147                     break;
3148                 }
3149 
3150                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3151 
3152                 UChar32 c = UTEXT_NEXT32(fInputText);
3153                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3154                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3155                 success ^= (opValue != 0);        // flip sense for \D
3156                 if (success) {
3157                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3158                 } else {
3159                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160                 }
3161             }
3162             break;
3163 
3164 
3165         case URX_BACKSLASH_G:          // Test for position at end of previous match
3166             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3167                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3168             }
3169             break;
3170 
3171 
3172         case URX_BACKSLASH_X:
3173             //  Match a Grapheme, as defined by Unicode TR 29.
3174             //  Differs slightly from Perl, which consumes combining marks independently
3175             //    of context.
3176             {
3177 
3178                 // Fail if at end of input
3179                 if (fp->fInputIdx >= fActiveLimit) {
3180                     fHitEnd = TRUE;
3181                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182                     break;
3183                 }
3184 
3185                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3186 
3187                 // Examine (and consume) the current char.
3188                 //   Dispatch into a little state machine, based on the char.
3189                 UChar32  c;
3190                 c = UTEXT_NEXT32(fInputText);
3191                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192                 UnicodeSet **sets = fPattern->fStaticSets;
3193                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3194                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3195                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3196                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3197                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3198                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3199                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3200                 goto GC_Extend;
3201 
3202 
3203 
3204 GC_L:
3205                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3206                 c = UTEXT_NEXT32(fInputText);
3207                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3209                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3210                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3211                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3212                 UTEXT_PREVIOUS32(fInputText);
3213                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3214                 goto GC_Extend;
3215 
3216 GC_V:
3217                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3218                 c = UTEXT_NEXT32(fInputText);
3219                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3220                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3221                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3222                 UTEXT_PREVIOUS32(fInputText);
3223                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224                 goto GC_Extend;
3225 
3226 GC_T:
3227                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3228                 c = UTEXT_NEXT32(fInputText);
3229                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3231                 UTEXT_PREVIOUS32(fInputText);
3232                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233                 goto GC_Extend;
3234 
3235 GC_Extend:
3236                 // Combining characters are consumed here
3237                 for (;;) {
3238                     if (fp->fInputIdx >= fActiveLimit) {
3239                         break;
3240                     }
3241                     c = UTEXT_CURRENT32(fInputText);
3242                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3243                         break;
3244                     }
3245                     UTEXT_NEXT32(fInputText);
3246                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247                 }
3248                 goto GC_Done;
3249 
3250 GC_Control:
3251                 // Most control chars stand alone (don't combine with combining chars),
3252                 //   except for that CR/LF sequence is a single grapheme cluster.
3253                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3254                     c = UTEXT_NEXT32(fInputText);
3255                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3256                 }
3257 
3258 GC_Done:
3259                 if (fp->fInputIdx >= fActiveLimit) {
3260                     fHitEnd = TRUE;
3261                 }
3262                 break;
3263             }
3264 
3265 
3266 
3267 
3268         case URX_BACKSLASH_Z:          // Test for end of Input
3269             if (fp->fInputIdx < fAnchorLimit) {
3270                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3271             } else {
3272                 fHitEnd = TRUE;
3273                 fRequireEnd = TRUE;
3274             }
3275             break;
3276 
3277 
3278 
3279         case URX_STATIC_SETREF:
3280             {
3281                 // Test input character against one of the predefined sets
3282                 //    (Word Characters, for example)
3283                 // The high bit of the op value is a flag for the match polarity.
3284                 //    0:   success if input char is in set.
3285                 //    1:   success if input char is not in set.
3286                 if (fp->fInputIdx >= fActiveLimit) {
3287                     fHitEnd = TRUE;
3288                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3289                     break;
3290                 }
3291 
3292                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3293                 opValue &= ~URX_NEG_SET;
3294                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3295 
3296                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3297                 UChar32 c = UTEXT_NEXT32(fInputText);
3298                 if (c < 256) {
3299                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3300                     if (s8->contains(c)) {
3301                         success = !success;
3302                     }
3303                 } else {
3304                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3305                     if (s->contains(c)) {
3306                         success = !success;
3307                     }
3308                 }
3309                 if (success) {
3310                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311                 } else {
3312                     // the character wasn't in the set.
3313                     #ifdef REGEX_SMART_BACKTRACKING
3314                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3315                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3316                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3317                             // Try to find it, backwards
3318                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3319                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3320                             do {
3321                                 c = UTEXT_PREVIOUS32(fInputText);
3322                                 if (c == U_SENTINEL) {
3323                                     break;
3324                                 } else if (c < 256) {
3325                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3326                                     if (s8->contains(c)) {
3327                                         success = !success;
3328                                     }
3329                                 } else {
3330                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3331                                     if (s->contains(c)) {
3332                                         success = !success;
3333                                     }
3334                                 }
3335                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3336 
3337                             if (success && c != U_SENTINEL) {
3338                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3340                                 if (fp->fInputIdx > backSearchIndex) {
3341                                     fp = StateSave(fp, fp->fPatIdx, status);
3342                                 }
3343                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3344                                 break;
3345                             }
3346                         }
3347                     }
3348                     #endif
3349                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3350                 }
3351             }
3352             break;
3353 
3354 
3355         case URX_STAT_SETREF_N:
3356             {
3357                 // Test input character for NOT being a member of  one of
3358                 //    the predefined sets (Word Characters, for example)
3359                 if (fp->fInputIdx >= fActiveLimit) {
3360                     fHitEnd = TRUE;
3361                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3362                     break;
3363                 }
3364 
3365                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3366 
3367                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3368 
3369                 UChar32 c = UTEXT_NEXT32(fInputText);
3370                 if (c < 256) {
3371                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3372                     if (s8->contains(c) == FALSE) {
3373                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3374                         break;
3375                     }
3376                 } else {
3377                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3378                     if (s->contains(c) == FALSE) {
3379                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3380                         break;
3381                     }
3382                 }
3383                 // the character wasn't in the set.
3384                 #ifdef REGEX_SMART_BACKTRACKING
3385                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3386                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3387                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3388                         // Try to find it, backwards
3389                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3390                         UBool success = FALSE;
3391                         do {
3392                             c = UTEXT_PREVIOUS32(fInputText);
3393                             if (c == U_SENTINEL) {
3394                                 break;
3395                             } else if (c < 256) {
3396                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3397                                 if (s8->contains(c) == FALSE) {
3398                                     success = TRUE;
3399                                     break;
3400                                 }
3401                             } else {
3402                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3403                                 if (s->contains(c) == FALSE) {
3404                                     success = TRUE;
3405                                     break;
3406                                 }
3407                             }
3408                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3409 
3410                         if (success) {
3411                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3412                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3413                             if (fp->fInputIdx > backSearchIndex) {
3414                                 fp = StateSave(fp, fp->fPatIdx, status);
3415                             }
3416                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3417                             break;
3418                         }
3419                     }
3420                 }
3421                 #endif
3422                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3423             }
3424             break;
3425 
3426 
3427         case URX_SETREF:
3428             if (fp->fInputIdx >= fActiveLimit) {
3429                 fHitEnd = TRUE;
3430                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3431                 break;
3432             } else {
3433                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3434 
3435                 // There is input left.  Pick up one char and test it for set membership.
3436                 UChar32 c = UTEXT_NEXT32(fInputText);
3437                 U_ASSERT(opValue > 0 && opValue < sets->size());
3438                 if (c<256) {
3439                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3440                     if (s8->contains(c)) {
3441                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442                         break;
3443                     }
3444                 } else {
3445                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3446                     if (s->contains(c)) {
3447                         // The character is in the set.  A Match.
3448                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3449                         break;
3450                     }
3451                 }
3452 
3453                 // the character wasn't in the set.
3454                 #ifdef REGEX_SMART_BACKTRACKING
3455                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3456                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3457                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3458                         // Try to find it, backwards
3459                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3460                         UBool success = FALSE;
3461                         do {
3462                             c = UTEXT_PREVIOUS32(fInputText);
3463                             if (c == U_SENTINEL) {
3464                                 break;
3465                             } else if (c < 256) {
3466                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3467                                 if (s8->contains(c)) {
3468                                     success = TRUE;
3469                                     break;
3470                                 }
3471                             } else {
3472                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3473                                 if (s->contains(c)) {
3474                                     success = TRUE;
3475                                     break;
3476                                 }
3477                             }
3478                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3479 
3480                         if (success) {
3481                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3482                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483                             if (fp->fInputIdx > backSearchIndex) {
3484                                 fp = StateSave(fp, fp->fPatIdx, status);
3485                             }
3486                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3487                             break;
3488                         }
3489                     }
3490                 }
3491                 #endif
3492                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3493             }
3494             break;
3495 
3496 
3497         case URX_DOTANY:
3498             {
3499                 // . matches anything, but stops at end-of-line.
3500                 if (fp->fInputIdx >= fActiveLimit) {
3501                     // At end of input.  Match failed.  Backtrack out.
3502                     fHitEnd = TRUE;
3503                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3504                     break;
3505                 }
3506 
3507                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3508 
3509                 // There is input left.  Advance over one char, unless we've hit end-of-line
3510                 UChar32 c = UTEXT_NEXT32(fInputText);
3511                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3512                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3513                     // End of line in normal mode.   . does not match.
3514                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3515                     break;
3516                 }
3517                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3518             }
3519             break;
3520 
3521 
3522         case URX_DOTANY_ALL:
3523             {
3524                 // ., in dot-matches-all (including new lines) mode
3525                 if (fp->fInputIdx >= fActiveLimit) {
3526                     // At end of input.  Match failed.  Backtrack out.
3527                     fHitEnd = TRUE;
3528                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3529                     break;
3530                 }
3531 
3532                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3533 
3534                 // There is input left.  Advance over one char, except if we are
3535                 //   at a cr/lf, advance over both of them.
3536                 UChar32 c;
3537                 c = UTEXT_NEXT32(fInputText);
3538                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3539                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3540                     // In the case of a CR/LF, we need to advance over both.
3541                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3542                     if (nextc == 0x0a) {
3543                         UTEXT_NEXT32(fInputText);
3544                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3545                     }
3546                 }
3547             }
3548             break;
3549 
3550 
3551         case URX_DOTANY_UNIX:
3552             {
3553                 // '.' operator, matches all, but stops at end-of-line.
3554                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3555                 if (fp->fInputIdx >= fActiveLimit) {
3556                     // At end of input.  Match failed.  Backtrack out.
3557                     fHitEnd = TRUE;
3558                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3559                     break;
3560                 }
3561 
3562                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3563 
3564                 // There is input left.  Advance over one char, unless we've hit end-of-line
3565                 UChar32 c = UTEXT_NEXT32(fInputText);
3566                 if (c == 0x0a) {
3567                     // End of line in normal mode.   '.' does not match the \n
3568                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3569                 } else {
3570                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3571                 }
3572             }
3573             break;
3574 
3575 
3576         case URX_JMP:
3577             fp->fPatIdx = opValue;
3578             break;
3579 
3580         case URX_FAIL:
3581             isMatch = FALSE;
3582             goto breakFromLoop;
3583 
3584         case URX_JMP_SAV:
3585             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3586             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3587             fp->fPatIdx = opValue;                         // Then JMP.
3588             break;
3589 
3590         case URX_JMP_SAV_X:
3591             // This opcode is used with (x)+, when x can match a zero length string.
3592             // Same as JMP_SAV, except conditional on the match having made forward progress.
3593             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3594             //   data address of the input position at the start of the loop.
3595             {
3596                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3597                 int32_t  stoOp = (int32_t)pat[opValue-1];
3598                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3599                 int32_t  frameLoc = URX_VAL(stoOp);
3600                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3601                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3602                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3603                 if (prevInputIdx < fp->fInputIdx) {
3604                     // The match did make progress.  Repeat the loop.
3605                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3606                     fp->fPatIdx = opValue;
3607                     fp->fExtra[frameLoc] = fp->fInputIdx;
3608                 }
3609                 // If the input position did not advance, we do nothing here,
3610                 //   execution will fall out of the loop.
3611             }
3612             break;
3613 
3614         case URX_CTR_INIT:
3615             {
3616                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3617                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3618 
3619                 // Pick up the three extra operands that CTR_INIT has, and
3620                 //    skip the pattern location counter past
3621                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3622                 fp->fPatIdx += 3;
3623                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3624                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3625                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3626                 U_ASSERT(minCount>=0);
3627                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3628                 U_ASSERT(loopLoc>fp->fPatIdx);
3629 
3630                 if (minCount == 0) {
3631                     fp = StateSave(fp, loopLoc+1, status);
3632                 }
3633                 if (maxCount == 0) {
3634                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3635                 }
3636             }
3637             break;
3638 
3639         case URX_CTR_LOOP:
3640             {
3641                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3642                 int32_t initOp = (int32_t)pat[opValue];
3643                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3644                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3645                 int32_t minCount  = (int32_t)pat[opValue+2];
3646                 int32_t maxCount  = (int32_t)pat[opValue+3];
3647                 // Increment the counter.  Note: we DIDN'T worry about counter
3648                 //   overflow, since the data comes from UnicodeStrings, which
3649                 //   stores its length in an int32_t. Do we have to think about
3650                 //   this now that we're using UText? Probably not, since the length
3651                 //   in UChar32s is still an int32_t.
3652                 (*pCounter)++;
3653                 U_ASSERT(*pCounter > 0);
3654                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3655                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3656                     break;
3657                 }
3658                 if (*pCounter >= minCount) {
3659                     fp = StateSave(fp, fp->fPatIdx, status);
3660                 }
3661                 fp->fPatIdx = opValue + 4;    // Loop back.
3662             }
3663             break;
3664 
3665         case URX_CTR_INIT_NG:
3666             {
3667                 // Initialize a non-greedy loop
3668                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3669                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3670 
3671                 // Pick up the three extra operands that CTR_INIT has, and
3672                 //    skip the pattern location counter past
3673                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3674                 fp->fPatIdx += 3;
3675                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3676                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3677                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3678                 U_ASSERT(minCount>=0);
3679                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3680                 U_ASSERT(loopLoc>fp->fPatIdx);
3681 
3682                 if (minCount == 0) {
3683                     if (maxCount != 0) {
3684                         fp = StateSave(fp, fp->fPatIdx, status);
3685                     }
3686                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3687                 }
3688             }
3689             break;
3690 
3691         case URX_CTR_LOOP_NG:
3692             {
3693                 // Non-greedy {min, max} loops
3694                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3695                 int32_t initOp = (int32_t)pat[opValue];
3696                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3697                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3698                 int32_t minCount  = (int32_t)pat[opValue+2];
3699                 int32_t maxCount  = (int32_t)pat[opValue+3];
3700                 // Increment the counter.  Note: we DIDN'T worry about counter
3701                 //   overflow, since the data comes from UnicodeStrings, which
3702                 //   stores its length in an int32_t. Do we have to think about
3703                 //   this now that we're using UText? Probably not, since the length
3704                 //   in UChar32s is still an int32_t.
3705                 (*pCounter)++;
3706                 U_ASSERT(*pCounter > 0);
3707 
3708                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3709                     // The loop has matched the maximum permitted number of times.
3710                     //   Break out of here with no action.  Matching will
3711                     //   continue with the following pattern.
3712                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3713                     break;
3714                 }
3715 
3716                 if (*pCounter < minCount) {
3717                     // We haven't met the minimum number of matches yet.
3718                     //   Loop back for another one.
3719                     fp->fPatIdx = opValue + 4;    // Loop back.
3720                 } else {
3721                     // We do have the minimum number of matches.
3722                     //   Fall into the following pattern, but first do
3723                     //   a state save to the top of the loop, so that a failure
3724                     //   in the following pattern will try another iteration of the loop.
3725                     fp = StateSave(fp, opValue + 4, status);
3726                 }
3727             }
3728             break;
3729 
3730         case URX_STO_SP:
3731             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3732             fData[opValue] = fStack->size();
3733             break;
3734 
3735         case URX_LD_SP:
3736             {
3737                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3738                 int32_t newStackSize = (int32_t)fData[opValue];
3739                 U_ASSERT(newStackSize <= fStack->size());
3740                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3741                 if (newFP == (int64_t *)fp) {
3742                     break;
3743                 }
3744                 int32_t i;
3745                 for (i=0; i<fFrameSize; i++) {
3746                     newFP[i] = ((int64_t *)fp)[i];
3747                 }
3748                 fp = (REStackFrame *)newFP;
3749                 fStack->setSize(newStackSize);
3750             }
3751             break;
3752 
3753         case URX_BACKREF:
3754         case URX_BACKREF_I:
3755             {
3756                 U_ASSERT(opValue < fFrameSize);
3757                 int64_t groupStartIdx = fp->fExtra[opValue];
3758                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3759                 U_ASSERT(groupStartIdx <= groupEndIdx);
3760                 if (groupStartIdx < 0) {
3761                     // This capture group has not participated in the match thus far,
3762                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3763                 }
3764 
3765                 if (groupEndIdx == groupStartIdx) {
3766                     //   The capture group match was of an empty string.
3767                     //   Verified by testing:  Perl matches succeed in this case, so
3768                     //   we do too.
3769                     break;
3770                 }
3771 
3772                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3773                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3774 
3775                 UBool haveMatch = (opType == URX_BACKREF ?
3776                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3777                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3778                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3779 
3780                 if (fp->fInputIdx > fActiveLimit) {
3781                     fHitEnd = TRUE;
3782                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3783                 } else if (!haveMatch) {
3784                     if (fp->fInputIdx == fActiveLimit) {
3785                         fHitEnd = TRUE;
3786                     }
3787                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3788                 }
3789             }
3790             break;
3791 
3792         case URX_STO_INP_LOC:
3793             {
3794                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3795                 fp->fExtra[opValue] = fp->fInputIdx;
3796             }
3797             break;
3798 
3799         case URX_JMPX:
3800             {
3801                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3802                 fp->fPatIdx += 1;
3803                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3804                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3805                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3806                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3807                 if (savedInputIdx < fp->fInputIdx) {
3808                     fp->fPatIdx = opValue;                               // JMP
3809                 } else {
3810                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3811                 }
3812             }
3813             break;
3814 
3815         case URX_LA_START:
3816             {
3817                 // Entering a lookahead block.
3818                 // Save Stack Ptr, Input Pos.
3819                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3820                 fData[opValue]   = fStack->size();
3821                 fData[opValue+1] = fp->fInputIdx;
3822                 fActiveStart     = fLookStart;          // Set the match region change for
3823                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3824             }
3825             break;
3826 
3827         case URX_LA_END:
3828             {
3829                 // Leaving a look-ahead block.
3830                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3831                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3832                 int32_t stackSize = fStack->size();
3833                 int32_t newStackSize =(int32_t)fData[opValue];
3834                 U_ASSERT(stackSize >= newStackSize);
3835                 if (stackSize > newStackSize) {
3836                     // Copy the current top frame back to the new (cut back) top frame.
3837                     //   This makes the capture groups from within the look-ahead
3838                     //   expression available.
3839                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3840                     int32_t i;
3841                     for (i=0; i<fFrameSize; i++) {
3842                         newFP[i] = ((int64_t *)fp)[i];
3843                     }
3844                     fp = (REStackFrame *)newFP;
3845                     fStack->setSize(newStackSize);
3846                 }
3847                 fp->fInputIdx = fData[opValue+1];
3848 
3849                 // Restore the active region bounds in the input string; they may have
3850                 //    been changed because of transparent bounds on a Region.
3851                 fActiveStart = fRegionStart;
3852                 fActiveLimit = fRegionLimit;
3853             }
3854             break;
3855 
3856         case URX_ONECHAR_I:
3857             if (fp->fInputIdx < fActiveLimit) {
3858                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3859 
3860                 UChar32 c = UTEXT_NEXT32(fInputText);
3861                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3862                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3863                     break;
3864                 }
3865             } else {
3866                 fHitEnd = TRUE;
3867             }
3868 
3869             #ifdef REGEX_SMART_BACKTRACKING
3870             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3871                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3872                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3873                     UBool success = FALSE;
3874                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
3875                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3876                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3877                             success = TRUE;
3878                             break;
3879                         } else if (c == U_SENTINEL) {
3880                             break;
3881                         }
3882                         c = UTEXT_PREVIOUS32(fInputText);
3883                     }
3884                     if (success) {
3885                         fHitEnd = FALSE;
3886                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3887                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3888                         if (fp->fInputIdx > backSearchIndex) {
3889                             fp = StateSave(fp, fp->fPatIdx, status);
3890                         }
3891                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
3892                         break;
3893                     }
3894                 }
3895             }
3896             #endif
3897 
3898             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3899             break;
3900 
3901         case URX_STRING_I:
3902             {
3903                 // Test input against a literal string.
3904                 // Strings require two slots in the compiled pattern, one for the
3905                 //   offset to the string text, and one for the length.
3906                 const UCaseProps *csp = ucase_getSingleton();
3907                 {
3908                     int32_t stringStartIdx, stringLen;
3909                     stringStartIdx = opValue;
3910 
3911                     op      = (int32_t)pat[fp->fPatIdx];
3912                     fp->fPatIdx++;
3913                     opType  = URX_TYPE(op);
3914                     opValue = URX_VAL(op);
3915                     U_ASSERT(opType == URX_STRING_LEN);
3916                     stringLen = opValue;
3917 
3918                     const UChar *patternChars = litText+stringStartIdx;
3919                     const UChar *patternEnd = patternChars+stringLen;
3920 
3921                     const UChar *foldChars = NULL;
3922                     int32_t foldOffset, foldLength;
3923                     UChar32 c;
3924 
3925                     foldOffset = foldLength = 0;
3926                     UBool success = TRUE;
3927 
3928                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3929                     while (patternChars < patternEnd && success) {
3930                         if(foldOffset < foldLength) {
3931                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3932                         } else {
3933                             c = UTEXT_NEXT32(fInputText);
3934                             if (c != U_SENTINEL) {
3935                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3936                                 if(foldLength >= 0) {
3937                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3938                                         foldOffset = 0;
3939                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3940                                     } else {
3941                                         c = foldLength;
3942                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
3943                                     }
3944                                 }
3945                             }
3946 
3947                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948                         }
3949 
3950                         success = FALSE;
3951                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3952                             if (U_IS_BMP(c)) {
3953                                 success = (*patternChars == c);
3954                                 patternChars += 1;
3955                             } else if (patternChars+1 < patternEnd) {
3956                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3957                                 patternChars += 2;
3958                             }
3959                         } else {
3960                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
3961                         }
3962                     }
3963 
3964                     if (!success) {
3965                         #ifdef REGEX_SMART_BACKTRACKING
3966                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
3967                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3968                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3969                                 // Reset to last start point
3970                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3971                                 patternChars = litText+stringStartIdx;
3972 
3973                                 // Search backwards for a possible start
3974                                 do {
3975                                     c = UTEXT_PREVIOUS32(fInputText);
3976                                     if (c == U_SENTINEL) {
3977                                         break;
3978                                     } else {
3979                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3980                                         if(foldLength >= 0) {
3981                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3982                                                 foldOffset = 0;
3983                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3984                                             } else {
3985                                                 c = foldLength;
3986                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
3987                                             }
3988                                         }
3989 
3990                                         if ((U_IS_BMP(c) && *patternChars == c) ||
3991                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
3992                                             success = TRUE;
3993                                             break;
3994                                         }
3995                                     }
3996                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3997 
3998                                 // And try again
3999                                 if (success) {
4000                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4001                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4002                                     if (fp->fInputIdx > backSearchIndex) {
4003                                         fp = StateSave(fp, fp->fPatIdx, status);
4004                                     }
4005                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
4006                                     break;
4007                                 }
4008                             }
4009                         }
4010                         #endif
4011                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4012                     }
4013                 }
4014             }
4015             break;
4016 
4017         case URX_LB_START:
4018             {
4019                 // Entering a look-behind block.
4020                 // Save Stack Ptr, Input Pos.
4021                 //   TODO:  implement transparent bounds.  Ticket #6067
4022                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4023                 fData[opValue]   = fStack->size();
4024                 fData[opValue+1] = fp->fInputIdx;
4025                 // Init the variable containing the start index for attempted matches.
4026                 fData[opValue+2] = -1;
4027                 // Save input string length, then reset to pin any matches to end at
4028                 //   the current position.
4029                 fData[opValue+3] = fActiveLimit;
4030                 fActiveLimit     = fp->fInputIdx;
4031             }
4032             break;
4033 
4034 
4035         case URX_LB_CONT:
4036             {
4037                 // Positive Look-Behind, at top of loop checking for matches of LB expression
4038                 //    at all possible input starting positions.
4039 
4040                 // Fetch the min and max possible match lengths.  They are the operands
4041                 //   of this op in the pattern.
4042                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4043                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4044                 U_ASSERT(minML <= maxML);
4045                 U_ASSERT(minML >= 0);
4046 
4047                 // Fetch (from data) the last input index where a match was attempted.
4048                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4049                 int64_t  *lbStartIdx = &fData[opValue+2];
4050                 if (*lbStartIdx < 0) {
4051                     // First time through loop.
4052                     *lbStartIdx = fp->fInputIdx - minML;
4053                 } else {
4054                     // 2nd through nth time through the loop.
4055                     // Back up start position for match by one.
4056                     if (*lbStartIdx == 0) {
4057                         (*lbStartIdx)--;
4058                     } else {
4059                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4060                         UTEXT_PREVIOUS32(fInputText);
4061                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4062                     }
4063                 }
4064 
4065                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4066                     // We have tried all potential match starting points without
4067                     //  getting a match.  Backtrack out, and out of the
4068                     //   Look Behind altogether.
4069                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4070                     int64_t restoreInputLen = fData[opValue+3];
4071                     U_ASSERT(restoreInputLen >= fActiveLimit);
4072                     U_ASSERT(restoreInputLen <= fInputLength);
4073                     fActiveLimit = restoreInputLen;
4074                     break;
4075                 }
4076 
4077                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4078                 //      (successful match will fall off the end of the loop.)
4079                 fp = StateSave(fp, fp->fPatIdx-3, status);
4080                 fp->fInputIdx = *lbStartIdx;
4081             }
4082             break;
4083 
4084         case URX_LB_END:
4085             // End of a look-behind block, after a successful match.
4086             {
4087                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4088                 if (fp->fInputIdx != fActiveLimit) {
4089                     //  The look-behind expression matched, but the match did not
4090                     //    extend all the way to the point that we are looking behind from.
4091                     //  FAIL out of here, which will take us back to the LB_CONT, which
4092                     //     will retry the match starting at another position or fail
4093                     //     the look-behind altogether, whichever is appropriate.
4094                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4095                     break;
4096                 }
4097 
4098                 // Look-behind match is good.  Restore the orignal input string length,
4099                 //   which had been truncated to pin the end of the lookbehind match to the
4100                 //   position being looked-behind.
4101                 int64_t originalInputLen = fData[opValue+3];
4102                 U_ASSERT(originalInputLen >= fActiveLimit);
4103                 U_ASSERT(originalInputLen <= fInputLength);
4104                 fActiveLimit = originalInputLen;
4105             }
4106             break;
4107 
4108 
4109         case URX_LBN_CONT:
4110             {
4111                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4112                 //    at all possible input starting positions.
4113 
4114                 // Fetch the extra parameters of this op.
4115                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4116                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4117                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4118                         continueLoc = URX_VAL(continueLoc);
4119                 U_ASSERT(minML <= maxML);
4120                 U_ASSERT(minML >= 0);
4121                 U_ASSERT(continueLoc > fp->fPatIdx);
4122 
4123                 // Fetch (from data) the last input index where a match was attempted.
4124                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125                 int64_t  *lbStartIdx = &fData[opValue+2];
4126                 if (*lbStartIdx < 0) {
4127                     // First time through loop.
4128                     *lbStartIdx = fp->fInputIdx - minML;
4129                 } else {
4130                     // 2nd through nth time through the loop.
4131                     // Back up start position for match by one.
4132                     if (*lbStartIdx == 0) {
4133                         (*lbStartIdx)--;
4134                     } else {
4135                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4136                         UTEXT_PREVIOUS32(fInputText);
4137                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4138                     }
4139                 }
4140 
4141                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4142                     // We have tried all potential match starting points without
4143                     //  getting a match, which means that the negative lookbehind as
4144                     //  a whole has succeeded.  Jump forward to the continue location
4145                     int64_t restoreInputLen = fData[opValue+3];
4146                     U_ASSERT(restoreInputLen >= fActiveLimit);
4147                     U_ASSERT(restoreInputLen <= fInputLength);
4148                     fActiveLimit = restoreInputLen;
4149                     fp->fPatIdx = continueLoc;
4150                     break;
4151                 }
4152 
4153                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4154                 //      (successful match will cause a FAIL out of the loop altogether.)
4155                 fp = StateSave(fp, fp->fPatIdx-4, status);
4156                 fp->fInputIdx = *lbStartIdx;
4157             }
4158             break;
4159 
4160         case URX_LBN_END:
4161             // End of a negative look-behind block, after a successful match.
4162             {
4163                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4164                 if (fp->fInputIdx != fActiveLimit) {
4165                     //  The look-behind expression matched, but the match did not
4166                     //    extend all the way to the point that we are looking behind from.
4167                     //  FAIL out of here, which will take us back to the LB_CONT, which
4168                     //     will retry the match starting at another position or succeed
4169                     //     the look-behind altogether, whichever is appropriate.
4170                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4171                     break;
4172                 }
4173 
4174                 // Look-behind expression matched, which means look-behind test as
4175                 //   a whole Fails
4176 
4177                 //   Restore the orignal input string length, which had been truncated
4178                 //   inorder to pin the end of the lookbehind match
4179                 //   to the position being looked-behind.
4180                 int64_t originalInputLen = fData[opValue+3];
4181                 U_ASSERT(originalInputLen >= fActiveLimit);
4182                 U_ASSERT(originalInputLen <= fInputLength);
4183                 fActiveLimit = originalInputLen;
4184 
4185                 // Restore original stack position, discarding any state saved
4186                 //   by the successful pattern match.
4187                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4188                 int32_t newStackSize = (int32_t)fData[opValue];
4189                 U_ASSERT(fStack->size() > newStackSize);
4190                 fStack->setSize(newStackSize);
4191 
4192                 //  FAIL, which will take control back to someplace
4193                 //  prior to entering the look-behind test.
4194                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4195             }
4196             break;
4197 
4198 
4199         case URX_LOOP_SR_I:
4200             // Loop Initialization for the optimized implementation of
4201             //     [some character set]*
4202             //   This op scans through all matching input.
4203             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4204             {
4205                 U_ASSERT(opValue > 0 && opValue < sets->size());
4206                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4207                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4208 
4209                 // Loop through input, until either the input is exhausted or
4210                 //   we reach a character that is not a member of the set.
4211                 int64_t ix = fp->fInputIdx;
4212                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4213                 for (;;) {
4214                     if (ix >= fActiveLimit) {
4215                         fHitEnd = TRUE;
4216                         break;
4217                     }
4218                     UChar32 c = UTEXT_NEXT32(fInputText);
4219                     if (c<256) {
4220                         if (s8->contains(c) == FALSE) {
4221                             break;
4222                         }
4223                     } else {
4224                         if (s->contains(c) == FALSE) {
4225                             break;
4226                         }
4227                     }
4228                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4229                 }
4230 
4231                 // If there were no matching characters, skip over the loop altogether.
4232                 //   The loop doesn't run at all, a * op always succeeds.
4233                 if (ix == fp->fInputIdx) {
4234                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4235                     break;
4236                 }
4237 
4238                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4239                 //   must follow.  It's operand is the stack location
4240                 //   that holds the starting input index for the match of this [set]*
4241                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4242                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4243                 int32_t stackLoc = URX_VAL(loopcOp);
4244                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4245                 fp->fExtra[stackLoc] = fp->fInputIdx;
4246                 #ifdef REGEX_SMART_BACKTRACKING
4247                 backSearchIndex = fp->fInputIdx;
4248                 #endif
4249                 fp->fInputIdx = ix;
4250 
4251                 // Save State to the URX_LOOP_C op that follows this one,
4252                 //   so that match failures in the following code will return to there.
4253                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4254                 fp = StateSave(fp, fp->fPatIdx, status);
4255                 fp->fPatIdx++;
4256             }
4257             break;
4258 
4259 
4260         case URX_LOOP_DOT_I:
4261             // Loop Initialization for the optimized implementation of .*
4262             //   This op scans through all remaining input.
4263             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4264             {
4265                 // Loop through input until the input is exhausted (we reach an end-of-line)
4266                 // In DOTALL mode, we can just go straight to the end of the input.
4267                 int64_t ix;
4268                 if ((opValue & 1) == 1) {
4269                     // Dot-matches-All mode.  Jump straight to the end of the string.
4270                     ix = fActiveLimit;
4271                     fHitEnd = TRUE;
4272                 } else {
4273                     // NOT DOT ALL mode.  Line endings do not match '.'
4274                     // Scan forward until a line ending or end of input.
4275                     ix = fp->fInputIdx;
4276                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4277                     for (;;) {
4278                         if (ix >= fActiveLimit) {
4279                             fHitEnd = TRUE;
4280                             break;
4281                         }
4282                         UChar32 c = UTEXT_NEXT32(fInputText);
4283                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4284                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4285                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4286                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4287                                 //  char is a line ending.  Exit the scanning loop.
4288                                 break;
4289                             }
4290                         }
4291                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4292                     }
4293                 }
4294 
4295                 // If there were no matching characters, skip over the loop altogether.
4296                 //   The loop doesn't run at all, a * op always succeeds.
4297                 if (ix == fp->fInputIdx) {
4298                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4299                     break;
4300                 }
4301 
4302                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4303                 //   must follow.  It's operand is the stack location
4304                 //   that holds the starting input index for the match of this .*
4305                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4306                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4307                 int32_t stackLoc = URX_VAL(loopcOp);
4308                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4309                 fp->fExtra[stackLoc] = fp->fInputIdx;
4310                 #ifdef REGEX_SMART_BACKTRACKING
4311                 backSearchIndex = fp->fInputIdx;
4312                 #endif
4313                 fp->fInputIdx = ix;
4314 
4315                 // Save State to the URX_LOOP_C op that follows this one,
4316                 //   so that match failures in the following code will return to there.
4317                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4318                 fp = StateSave(fp, fp->fPatIdx, status);
4319                 fp->fPatIdx++;
4320             }
4321             break;
4322 
4323 
4324         case URX_LOOP_C:
4325             {
4326                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4327                 backSearchIndex = fp->fExtra[opValue];
4328                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4329                 if (backSearchIndex == fp->fInputIdx) {
4330                     // We've backed up the input idx to the point that the loop started.
4331                     // The loop is done.  Leave here without saving state.
4332                     //  Subsequent failures won't come back here.
4333                     break;
4334                 }
4335                 // Set up for the next iteration of the loop, with input index
4336                 //   backed up by one from the last time through,
4337                 //   and a state save to this instruction in case the following code fails again.
4338                 //   (We're going backwards because this loop emulates stack unwinding, not
4339                 //    the initial scan forward.)
4340                 U_ASSERT(fp->fInputIdx > 0);
4341                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4342                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4343                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4344 
4345                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4346                 if (prevC == 0x0a &&
4347                     fp->fInputIdx > backSearchIndex &&
4348                     twoPrevC == 0x0d) {
4349                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4350                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4351                         // .*, stepping back over CRLF pair.
4352                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4353                     }
4354                 }
4355 
4356 
4357                 fp = StateSave(fp, fp->fPatIdx-1, status);
4358             }
4359             break;
4360 
4361 
4362 
4363         default:
4364             // Trouble.  The compiled pattern contains an entry with an
4365             //           unrecognized type tag.
4366             U_ASSERT(FALSE);
4367         }
4368 
4369         if (U_FAILURE(status)) {
4370             isMatch = FALSE;
4371             break;
4372         }
4373     }
4374 
4375 breakFromLoop:
4376     fMatch = isMatch;
4377     if (isMatch) {
4378         fLastMatchEnd = fMatchEnd;
4379         fMatchStart   = startIdx;
4380         fMatchEnd     = fp->fInputIdx;
4381         if (fTraceDebug) {
4382             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4383         }
4384     }
4385     else
4386     {
4387         if (fTraceDebug) {
4388             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4389         }
4390     }
4391 
4392     fFrame = fp;                // The active stack frame when the engine stopped.
4393                                 //   Contains the capture group results that we need to
4394                                 //    access later.
4395     return;
4396 }
4397 
4398 
4399 //--------------------------------------------------------------------------------
4400 //
4401 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4402 //                  assumption that the entire string is available in the UText's
4403 //                  chunk buffer. For now, that means we can use int32_t indexes,
4404 //                  except for anything that needs to be saved (like group starts
4405 //                  and ends).
4406 //
4407 //                  startIdx:    begin matching a this index.
4408 //                  toEnd:       if true, match must extend to end of the input region
4409 //
4410 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4411 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4412     UBool       isMatch  = FALSE;      // True if the we have a match.
4413 
4414     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4415 
4416     int32_t     op;                    // Operation from the compiled pattern, split into
4417     int32_t     opType;                //    the opcode
4418     int32_t     opValue;               //    and the operand value.
4419 
4420 #ifdef REGEX_RUN_DEBUG
4421     if (fTraceDebug)
4422     {
4423         printf("MatchAt(startIdx=%ld)\n", startIdx);
4424         printf("Original Pattern: ");
4425         UChar32 c = utext_next32From(fPattern->fPattern, 0);
4426         while (c != U_SENTINEL) {
4427             if (c<32 || c>256) {
4428                 c = '.';
4429             }
4430             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4431 
4432             c = UTEXT_NEXT32(fPattern->fPattern);
4433         }
4434         printf("\n");
4435         printf("Input String: ");
4436         c = utext_next32From(fInputText, 0);
4437         while (c != U_SENTINEL) {
4438             if (c<32 || c>256) {
4439                 c = '.';
4440             }
4441             printf("%c", c);
4442 
4443             c = UTEXT_NEXT32(fInputText);
4444         }
4445         printf("\n");
4446         printf("\n");
4447     }
4448 #endif
4449 
4450     if (U_FAILURE(status)) {
4451         return;
4452     }
4453 
4454     //  Cache frequently referenced items from the compiled pattern
4455     //
4456     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4457 
4458     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4459     UVector             *sets          = fPattern->fSets;
4460 
4461     const UChar         *inputBuf      = fInputText->chunkContents;
4462 
4463     fFrameSize = fPattern->fFrameSize;
4464     REStackFrame        *fp            = resetStack();
4465 
4466     fp->fPatIdx   = 0;
4467     fp->fInputIdx = startIdx;
4468 
4469     // Zero out the pattern's static data
4470     int32_t i;
4471     for (i = 0; i<fPattern->fDataSize; i++) {
4472         fData[i] = 0;
4473     }
4474 
4475     //
4476     //  Main loop for interpreting the compiled pattern.
4477     //  One iteration of the loop per pattern operation performed.
4478     //
4479     for (;;) {
4480 #if 0
4481         if (_heapchk() != _HEAPOK) {
4482             fprintf(stderr, "Heap Trouble\n");
4483         }
4484 #endif
4485 
4486         op      = (int32_t)pat[fp->fPatIdx];
4487         opType  = URX_TYPE(op);
4488         opValue = URX_VAL(op);
4489 #ifdef REGEX_RUN_DEBUG
4490         if (fTraceDebug) {
4491             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4492             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4493                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4494             fPattern->dumpOp(fp->fPatIdx);
4495         }
4496 #endif
4497         fp->fPatIdx++;
4498 
4499         switch (opType) {
4500 
4501 
4502         case URX_NOP:
4503             break;
4504 
4505 
4506         case URX_BACKTRACK:
4507             // Force a backtrack.  In some circumstances, the pattern compiler
4508             //   will notice that the pattern can't possibly match anything, and will
4509             //   emit one of these at that point.
4510             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4511             break;
4512 
4513 
4514         case URX_ONECHAR:
4515             if (fp->fInputIdx < fActiveLimit) {
4516                 UChar32 c;
4517                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4518                 if (c == opValue) {
4519                     break;
4520                 }
4521             } else {
4522                 fHitEnd = TRUE;
4523             }
4524 
4525             #ifdef REGEX_SMART_BACKTRACKING
4526             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4527                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4528                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4529                     int64_t reverseIndex = fp->fInputIdx;
4530                     UChar32 c;
4531                     do {
4532                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4533                         if (c == opValue) {
4534                             break;
4535                         }
4536                     } while (reverseIndex > backSearchIndex);
4537                     if (c == opValue) {
4538                         fHitEnd = FALSE;
4539                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4540                         fp->fInputIdx = reverseIndex;
4541                         if (fp->fInputIdx > backSearchIndex) {
4542                             fp = StateSave(fp, fp->fPatIdx, status);
4543                         }
4544                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
4545                         break;
4546                     }
4547                 }
4548             }
4549             #endif
4550 
4551             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552             break;
4553 
4554 
4555         case URX_STRING:
4556             {
4557                 // Test input against a literal string.
4558                 // Strings require two slots in the compiled pattern, one for the
4559                 //   offset to the string text, and one for the length.
4560                 int32_t   stringStartIdx = opValue;
4561                 int32_t   stringLen;
4562 
4563                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4564                 fp->fPatIdx++;
4565                 opType    = URX_TYPE(op);
4566                 stringLen = URX_VAL(op);
4567                 U_ASSERT(opType == URX_STRING_LEN);
4568                 U_ASSERT(stringLen >= 2);
4569 
4570                 if (fp->fInputIdx + stringLen > fActiveLimit) {
4571                     // No match.  String is longer than the remaining input text.
4572                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
4573                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4574                     break;
4575                 }
4576 
4577                 const UChar * pInp = inputBuf + fp->fInputIdx;
4578                 const UChar * pPat = litText+stringStartIdx;
4579                 const UChar * pEnd = pInp + stringLen;
4580                 UBool success = FALSE;
4581                 for(;;) {
4582                     if (*pInp == *pPat) {
4583                         pInp++;
4584                         pPat++;
4585                         if (pInp == pEnd) {
4586                             // Successful Match.
4587                             success = TRUE;
4588                             break;
4589                         }
4590                     } else {
4591                         // Match failed.
4592                         break;
4593                     }
4594                 }
4595 
4596                 if (success) {
4597                     fp->fInputIdx += stringLen;
4598                 } else {
4599                     #ifdef REGEX_SMART_BACKTRACKING
4600                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4601                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4602                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4603                             // Reset to last start point
4604                             int64_t reverseIndex = fp->fInputIdx;
4605                             UChar32 c;
4606                             pPat = litText+stringStartIdx;
4607 
4608                             // Search backwards for a possible start
4609                             do {
4610                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4611                                 if ((U_IS_BMP(c) && *pPat == c) ||
4612                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4613                                     success = TRUE;
4614                                     break;
4615                                 }
4616                             } while (reverseIndex > backSearchIndex);
4617 
4618                             // And try again
4619                             if (success) {
4620                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621                                 fp->fInputIdx = reverseIndex;
4622                                 if (fp->fInputIdx > backSearchIndex) {
4623                                     fp = StateSave(fp, fp->fPatIdx, status);
4624                                 }
4625                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4626                                 break;
4627                             }
4628                         }
4629                     }
4630                     #endif
4631                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632                 }
4633             }
4634             break;
4635 
4636 
4637         case URX_STATE_SAVE:
4638             fp = StateSave(fp, opValue, status);
4639             break;
4640 
4641 
4642         case URX_END:
4643             // The match loop will exit via this path on a successful match,
4644             //   when we reach the end of the pattern.
4645             if (toEnd && fp->fInputIdx != fActiveLimit) {
4646                 // The pattern matched, but not to the end of input.  Try some more.
4647                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648                 break;
4649             }
4650             isMatch = TRUE;
4651             goto  breakFromLoop;
4652 
4653             // Start and End Capture stack frame variables are laid out out like this:
4654             //  fp->fExtra[opValue]  - The start of a completed capture group
4655             //             opValue+1 - The end   of a completed capture group
4656             //             opValue+2 - the start of a capture group whose end
4657             //                          has not yet been reached (and might not ever be).
4658         case URX_START_CAPTURE:
4659             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4660             fp->fExtra[opValue+2] = fp->fInputIdx;
4661             break;
4662 
4663 
4664         case URX_END_CAPTURE:
4665             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4666             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4667             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4668             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4669             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4670             break;
4671 
4672 
4673         case URX_DOLLAR:                   //  $, test for End of line
4674             //     or for position before new line at end of input
4675             if (fp->fInputIdx < fAnchorLimit-2) {
4676                 // We are no where near the end of input.  Fail.
4677                 //   This is the common case.  Keep it first.
4678                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4679                 break;
4680             }
4681             if (fp->fInputIdx >= fAnchorLimit) {
4682                 // We really are at the end of input.  Success.
4683                 fHitEnd = TRUE;
4684                 fRequireEnd = TRUE;
4685                 break;
4686             }
4687 
4688             // If we are positioned just before a new-line that is located at the
4689             //   end of input, succeed.
4690             if (fp->fInputIdx == fAnchorLimit-1) {
4691                 UChar32 c;
4692                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4693 
4694                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4695                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4696                         // At new-line at end of input. Success
4697                         fHitEnd = TRUE;
4698                         fRequireEnd = TRUE;
4699                         break;
4700                     }
4701                 }
4702             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4703                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4704                     fHitEnd = TRUE;
4705                     fRequireEnd = TRUE;
4706                     break;                         // At CR/LF at end of input.  Success
4707             }
4708 
4709             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4710 
4711             break;
4712 
4713 
4714         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4715             if (fp->fInputIdx >= fAnchorLimit-1) {
4716                 // Either at the last character of input, or off the end.
4717                 if (fp->fInputIdx == fAnchorLimit-1) {
4718                     // At last char of input.  Success if it's a new line.
4719                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4720                         fHitEnd = TRUE;
4721                         fRequireEnd = TRUE;
4722                         break;
4723                     }
4724                 } else {
4725                     // Off the end of input.  Success.
4726                     fHitEnd = TRUE;
4727                     fRequireEnd = TRUE;
4728                     break;
4729                 }
4730             }
4731 
4732             // Not at end of input.  Back-track out.
4733             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734             break;
4735 
4736 
4737         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4738             {
4739                 if (fp->fInputIdx >= fAnchorLimit) {
4740                     // We really are at the end of input.  Success.
4741                     fHitEnd = TRUE;
4742                     fRequireEnd = TRUE;
4743                     break;
4744                 }
4745                 // If we are positioned just before a new-line, succeed.
4746                 // It makes no difference where the new-line is within the input.
4747                 UChar32 c = inputBuf[fp->fInputIdx];
4748                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4749                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4750                     //  In multi-line mode, hitting a new-line just before the end of input does not
4751                     //   set the hitEnd or requireEnd flags
4752                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4753                         break;
4754                     }
4755                 }
4756                 // not at a new line.  Fail.
4757                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758             }
4759             break;
4760 
4761 
4762         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4763             {
4764                 if (fp->fInputIdx >= fAnchorLimit) {
4765                     // We really are at the end of input.  Success.
4766                     fHitEnd = TRUE;
4767                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4768                     break;               //   adding a new-line would not lose the match.
4769                 }
4770                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4771                 // It makes no difference where the new-line is within the input.
4772                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4773                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4774                 }
4775             }
4776             break;
4777 
4778 
4779         case URX_CARET:                    //  ^, test for start of line
4780             if (fp->fInputIdx != fAnchorStart) {
4781                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4782             }
4783             break;
4784 
4785 
4786         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4787             {
4788                 if (fp->fInputIdx == fAnchorStart) {
4789                     // We are at the start input.  Success.
4790                     break;
4791                 }
4792                 // Check whether character just before the current pos is a new-line
4793                 //   unless we are at the end of input
4794                 UChar  c = inputBuf[fp->fInputIdx - 1];
4795                 if ((fp->fInputIdx < fAnchorLimit) &&
4796                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4797                     //  It's a new-line.  ^ is true.  Success.
4798                     //  TODO:  what should be done with positions between a CR and LF?
4799                     break;
4800                 }
4801                 // Not at the start of a line.  Fail.
4802                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4803             }
4804             break;
4805 
4806 
4807         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4808             {
4809                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4810                 if (fp->fInputIdx <= fAnchorStart) {
4811                     // We are at the start input.  Success.
4812                     break;
4813                 }
4814                 // Check whether character just before the current pos is a new-line
4815                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4816                 UChar  c = inputBuf[fp->fInputIdx - 1];
4817                 if (c != 0x0a) {
4818                     // Not at the start of a line.  Back-track out.
4819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4820                 }
4821             }
4822             break;
4823 
4824         case URX_BACKSLASH_B:          // Test for word boundaries
4825             {
4826                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4827                 success ^= (opValue != 0);     // flip sense for \B
4828                 if (!success) {
4829                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830                 }
4831             }
4832             break;
4833 
4834 
4835         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4836             {
4837                 UBool success = isUWordBoundary(fp->fInputIdx);
4838                 success ^= (opValue != 0);     // flip sense for \B
4839                 if (!success) {
4840                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4841                 }
4842             }
4843             break;
4844 
4845 
4846         case URX_BACKSLASH_D:            // Test for decimal digit
4847             {
4848                 if (fp->fInputIdx >= fActiveLimit) {
4849                     fHitEnd = TRUE;
4850                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851                     break;
4852                 }
4853 
4854                 UChar32 c;
4855                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4856                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4857                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4858                 success ^= (opValue != 0);        // flip sense for \D
4859                 if (!success) {
4860                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861                 }
4862             }
4863             break;
4864 
4865 
4866         case URX_BACKSLASH_G:          // Test for position at end of previous match
4867             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4868                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4869             }
4870             break;
4871 
4872 
4873         case URX_BACKSLASH_X:
4874         //  Match a Grapheme, as defined by Unicode TR 29.
4875         //  Differs slightly from Perl, which consumes combining marks independently
4876         //    of context.
4877         {
4878 
4879             // Fail if at end of input
4880             if (fp->fInputIdx >= fActiveLimit) {
4881                 fHitEnd = TRUE;
4882                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4883                 break;
4884             }
4885 
4886             // Examine (and consume) the current char.
4887             //   Dispatch into a little state machine, based on the char.
4888             UChar32  c;
4889             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4890             UnicodeSet **sets = fPattern->fStaticSets;
4891             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4892             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4893             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4894             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4895             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4896             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4897             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4898             goto GC_Extend;
4899 
4900 
4901 
4902 GC_L:
4903             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4904             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4905             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4906             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4907             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4908             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4909             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4910             goto GC_Extend;
4911 
4912 GC_V:
4913             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4914             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4915             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4916             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4917             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4918             goto GC_Extend;
4919 
4920 GC_T:
4921             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4922             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4923             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4924             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4925             goto GC_Extend;
4926 
4927 GC_Extend:
4928             // Combining characters are consumed here
4929             for (;;) {
4930                 if (fp->fInputIdx >= fActiveLimit) {
4931                     break;
4932                 }
4933                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4934                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4935                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4936                     break;
4937                 }
4938             }
4939             goto GC_Done;
4940 
4941 GC_Control:
4942             // Most control chars stand alone (don't combine with combining chars),
4943             //   except for that CR/LF sequence is a single grapheme cluster.
4944             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4945                 fp->fInputIdx++;
4946             }
4947 
4948 GC_Done:
4949             if (fp->fInputIdx >= fActiveLimit) {
4950                 fHitEnd = TRUE;
4951             }
4952             break;
4953         }
4954 
4955 
4956 
4957 
4958         case URX_BACKSLASH_Z:          // Test for end of Input
4959             if (fp->fInputIdx < fAnchorLimit) {
4960                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961             } else {
4962                 fHitEnd = TRUE;
4963                 fRequireEnd = TRUE;
4964             }
4965             break;
4966 
4967 
4968 
4969         case URX_STATIC_SETREF:
4970             {
4971                 // Test input character against one of the predefined sets
4972                 //    (Word Characters, for example)
4973                 // The high bit of the op value is a flag for the match polarity.
4974                 //    0:   success if input char is in set.
4975                 //    1:   success if input char is not in set.
4976                 if (fp->fInputIdx >= fActiveLimit) {
4977                     fHitEnd = TRUE;
4978                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4979                     break;
4980                 }
4981 
4982                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4983                 opValue &= ~URX_NEG_SET;
4984                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4985 
4986                 UChar32 c;
4987                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4988                 if (c < 256) {
4989                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4990                     if (s8->contains(c)) {
4991                         success = !success;
4992                     }
4993                 } else {
4994                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4995                     if (s->contains(c)) {
4996                         success = !success;
4997                     }
4998                 }
4999                 if (!success) {
5000                     #ifdef REGEX_SMART_BACKTRACKING
5001                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5002                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5003                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5004                             // Try to find it, backwards
5005                             int64_t reverseIndex = fp->fInputIdx;
5006                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5007                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5008                             do {
5009                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5010                                 if (c < 256) {
5011                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5012                                     if (s8->contains(c)) {
5013                                         success = !success;
5014                                     }
5015                                 } else {
5016                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5017                                     if (s->contains(c)) {
5018                                         success = !success;
5019                                     }
5020                                 }
5021                             } while (reverseIndex > backSearchIndex && !success);
5022 
5023                             if (success) {
5024                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5025                                 fp->fInputIdx = reverseIndex;
5026                                 if (fp->fInputIdx > backSearchIndex) {
5027                                     fp = StateSave(fp, fp->fPatIdx, status);
5028                                 }
5029                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5030                                 break;
5031                             }
5032                         }
5033                     }
5034                     #endif
5035                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5036                 }
5037             }
5038             break;
5039 
5040 
5041         case URX_STAT_SETREF_N:
5042             {
5043                 // Test input character for NOT being a member of  one of
5044                 //    the predefined sets (Word Characters, for example)
5045                 if (fp->fInputIdx >= fActiveLimit) {
5046                     fHitEnd = TRUE;
5047                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5048                     break;
5049                 }
5050 
5051                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5052 
5053                 UChar32  c;
5054                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5055                 if (c < 256) {
5056                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5057                     if (s8->contains(c) == FALSE) {
5058                         break;
5059                     }
5060                 } else {
5061                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5062                     if (s->contains(c) == FALSE) {
5063                         break;
5064                     }
5065                 }
5066 
5067                 #ifdef REGEX_SMART_BACKTRACKING
5068                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5069                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5070                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5071                         // Try to find it, backwards
5072                         int64_t reverseIndex = fp->fInputIdx;
5073                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5074                         UBool success = FALSE;
5075                         do {
5076                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5077                             if (c < 256) {
5078                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5079                                 if (s8->contains(c) == FALSE) {
5080                                     success = TRUE;
5081                                     break;
5082                                 }
5083                             } else {
5084                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5085                                 if (s->contains(c) == FALSE) {
5086                                     success = TRUE;
5087                                     break;
5088                                 }
5089                             }
5090                         } while (reverseIndex > backSearchIndex);
5091 
5092                         if (success) {
5093                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5094                             fp->fInputIdx = reverseIndex;
5095                             if (fp->fInputIdx > backSearchIndex) {
5096                                 fp = StateSave(fp, fp->fPatIdx, status);
5097                             }
5098                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5099                             break;
5100                         }
5101                     }
5102                 }
5103                 #endif
5104                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5105             }
5106             break;
5107 
5108 
5109         case URX_SETREF:
5110             {
5111                 if (fp->fInputIdx >= fActiveLimit) {
5112                     fHitEnd = TRUE;
5113                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5114                     break;
5115                 }
5116 
5117                 U_ASSERT(opValue > 0 && opValue < sets->size());
5118 
5119                 // There is input left.  Pick up one char and test it for set membership.
5120                 UChar32  c;
5121                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5122                 if (c<256) {
5123                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5124                     if (s8->contains(c)) {
5125                         // The character is in the set.  A Match.
5126                         break;
5127                     }
5128                 } else {
5129                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5130                     if (s->contains(c)) {
5131                         // The character is in the set.  A Match.
5132                         break;
5133                     }
5134                 }
5135 
5136                 // the character wasn't in the set.
5137                 #ifdef REGEX_SMART_BACKTRACKING
5138                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5139                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5140                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5141                         // Try to find it, backwards
5142                         int64_t reverseIndex = fp->fInputIdx;
5143                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5144                         UBool success = FALSE;
5145                         do {
5146                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5147                             if (c < 256) {
5148                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5149                                 if (s8->contains(c)) {
5150                                     success = TRUE;
5151                                     break;
5152                                 }
5153                             } else {
5154                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5155                                 if (s->contains(c)) {
5156                                     success = TRUE;
5157                                     break;
5158                                 }
5159                             }
5160                         } while (reverseIndex > backSearchIndex);
5161 
5162                         if (success) {
5163                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5164                             fp->fInputIdx = reverseIndex;
5165                             if (fp->fInputIdx > reverseIndex) {
5166                                 fp = StateSave(fp, fp->fPatIdx, status);
5167                             }
5168                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5169                             break;
5170                         }
5171                     }
5172                 }
5173                 #endif
5174                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5175             }
5176             break;
5177 
5178 
5179         case URX_DOTANY:
5180             {
5181                 // . matches anything, but stops at end-of-line.
5182                 if (fp->fInputIdx >= fActiveLimit) {
5183                     // At end of input.  Match failed.  Backtrack out.
5184                     fHitEnd = TRUE;
5185                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5186                     break;
5187                 }
5188 
5189                 // There is input left.  Advance over one char, unless we've hit end-of-line
5190                 UChar32  c;
5191                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5192                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5193                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5194                     // End of line in normal mode.   . does not match.
5195                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5196                     break;
5197                 }
5198             }
5199             break;
5200 
5201 
5202         case URX_DOTANY_ALL:
5203             {
5204                 // . in dot-matches-all (including new lines) mode
5205                 if (fp->fInputIdx >= fActiveLimit) {
5206                     // At end of input.  Match failed.  Backtrack out.
5207                     fHitEnd = TRUE;
5208                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5209                     break;
5210                 }
5211 
5212                 // There is input left.  Advance over one char, except if we are
5213                 //   at a cr/lf, advance over both of them.
5214                 UChar32 c;
5215                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5216                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5217                     // In the case of a CR/LF, we need to advance over both.
5218                     if (inputBuf[fp->fInputIdx] == 0x0a) {
5219                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5220                     }
5221                 }
5222             }
5223             break;
5224 
5225 
5226         case URX_DOTANY_UNIX:
5227             {
5228                 // '.' operator, matches all, but stops at end-of-line.
5229                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5230                 if (fp->fInputIdx >= fActiveLimit) {
5231                     // At end of input.  Match failed.  Backtrack out.
5232                     fHitEnd = TRUE;
5233                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5234                     break;
5235                 }
5236 
5237                 // There is input left.  Advance over one char, unless we've hit end-of-line
5238                 UChar32 c;
5239                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5240                 if (c == 0x0a) {
5241                     // End of line in normal mode.   '.' does not match the \n
5242                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5243                 }
5244             }
5245             break;
5246 
5247 
5248         case URX_JMP:
5249             fp->fPatIdx = opValue;
5250             break;
5251 
5252         case URX_FAIL:
5253             isMatch = FALSE;
5254             goto breakFromLoop;
5255 
5256         case URX_JMP_SAV:
5257             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5258             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5259             fp->fPatIdx = opValue;                         // Then JMP.
5260             break;
5261 
5262         case URX_JMP_SAV_X:
5263             // This opcode is used with (x)+, when x can match a zero length string.
5264             // Same as JMP_SAV, except conditional on the match having made forward progress.
5265             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5266             //   data address of the input position at the start of the loop.
5267             {
5268                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5269                 int32_t  stoOp = (int32_t)pat[opValue-1];
5270                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5271                 int32_t  frameLoc = URX_VAL(stoOp);
5272                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5273                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5274                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5275                 if (prevInputIdx < fp->fInputIdx) {
5276                     // The match did make progress.  Repeat the loop.
5277                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5278                     fp->fPatIdx = opValue;
5279                     fp->fExtra[frameLoc] = fp->fInputIdx;
5280                 }
5281                 // If the input position did not advance, we do nothing here,
5282                 //   execution will fall out of the loop.
5283             }
5284             break;
5285 
5286         case URX_CTR_INIT:
5287             {
5288                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5289                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5290 
5291                 // Pick up the three extra operands that CTR_INIT has, and
5292                 //    skip the pattern location counter past
5293                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5294                 fp->fPatIdx += 3;
5295                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5296                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5297                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5298                 U_ASSERT(minCount>=0);
5299                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5300                 U_ASSERT(loopLoc>fp->fPatIdx);
5301 
5302                 if (minCount == 0) {
5303                     fp = StateSave(fp, loopLoc+1, status);
5304                 }
5305                 if (maxCount == 0) {
5306                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5307                 }
5308             }
5309             break;
5310 
5311         case URX_CTR_LOOP:
5312             {
5313                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5314                 int32_t initOp = (int32_t)pat[opValue];
5315                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5316                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5317                 int32_t minCount  = (int32_t)pat[opValue+2];
5318                 int32_t maxCount  = (int32_t)pat[opValue+3];
5319                 // Increment the counter.  Note: we DIDN'T worry about counter
5320                 //   overflow, since the data comes from UnicodeStrings, which
5321                 //   stores its length in an int32_t. Do we have to think about
5322                 //   this now that we're using UText? Probably not, since the length
5323                 //   in UChar32s is still an int32_t.
5324                 (*pCounter)++;
5325                 U_ASSERT(*pCounter > 0);
5326                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5327                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5328                     break;
5329                 }
5330                 if (*pCounter >= minCount) {
5331                     fp = StateSave(fp, fp->fPatIdx, status);
5332                 }
5333                 fp->fPatIdx = opValue + 4;    // Loop back.
5334             }
5335             break;
5336 
5337         case URX_CTR_INIT_NG:
5338             {
5339                 // Initialize a non-greedy loop
5340                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5341                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5342 
5343                 // Pick up the three extra operands that CTR_INIT has, and
5344                 //    skip the pattern location counter past
5345                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5346                 fp->fPatIdx += 3;
5347                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5348                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5349                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5350                 U_ASSERT(minCount>=0);
5351                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5352                 U_ASSERT(loopLoc>fp->fPatIdx);
5353 
5354                 if (minCount == 0) {
5355                     if (maxCount != 0) {
5356                         fp = StateSave(fp, fp->fPatIdx, status);
5357                     }
5358                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5359                 }
5360             }
5361             break;
5362 
5363         case URX_CTR_LOOP_NG:
5364             {
5365                 // Non-greedy {min, max} loops
5366                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5367                 int32_t initOp = (int32_t)pat[opValue];
5368                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5369                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5370                 int32_t minCount  = (int32_t)pat[opValue+2];
5371                 int32_t maxCount  = (int32_t)pat[opValue+3];
5372                 // Increment the counter.  Note: we DIDN'T worry about counter
5373                 //   overflow, since the data comes from UnicodeStrings, which
5374                 //   stores its length in an int32_t. Do we have to think about
5375                 //   this now that we're using UText? Probably not, since the length
5376                 //   in UChar32s is still an int32_t.
5377                 (*pCounter)++;
5378                 U_ASSERT(*pCounter > 0);
5379 
5380                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5381                     // The loop has matched the maximum permitted number of times.
5382                     //   Break out of here with no action.  Matching will
5383                     //   continue with the following pattern.
5384                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5385                     break;
5386                 }
5387 
5388                 if (*pCounter < minCount) {
5389                     // We haven't met the minimum number of matches yet.
5390                     //   Loop back for another one.
5391                     fp->fPatIdx = opValue + 4;    // Loop back.
5392                 } else {
5393                     // We do have the minimum number of matches.
5394                     //   Fall into the following pattern, but first do
5395                     //   a state save to the top of the loop, so that a failure
5396                     //   in the following pattern will try another iteration of the loop.
5397                     fp = StateSave(fp, opValue + 4, status);
5398                 }
5399             }
5400             break;
5401 
5402         case URX_STO_SP:
5403             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5404             fData[opValue] = fStack->size();
5405             break;
5406 
5407         case URX_LD_SP:
5408             {
5409                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5410                 int32_t newStackSize = (int32_t)fData[opValue];
5411                 U_ASSERT(newStackSize <= fStack->size());
5412                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5413                 if (newFP == (int64_t *)fp) {
5414                     break;
5415                 }
5416                 int32_t i;
5417                 for (i=0; i<fFrameSize; i++) {
5418                     newFP[i] = ((int64_t *)fp)[i];
5419                 }
5420                 fp = (REStackFrame *)newFP;
5421                 fStack->setSize(newStackSize);
5422             }
5423             break;
5424 
5425         case URX_BACKREF:
5426         case URX_BACKREF_I:
5427             {
5428                 U_ASSERT(opValue < fFrameSize);
5429                 int64_t groupStartIdx = fp->fExtra[opValue];
5430                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5431                 U_ASSERT(groupStartIdx <= groupEndIdx);
5432                 int64_t len = groupEndIdx-groupStartIdx;
5433                 if (groupStartIdx < 0) {
5434                     // This capture group has not participated in the match thus far,
5435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5436                 }
5437 
5438                 if (len == 0) {
5439                         //   The capture group match was of an empty string.
5440                         //   Verified by testing:  Perl matches succeed in this case, so
5441                         //   we do too.
5442                         break;
5443                     }
5444 
5445                 UBool  haveMatch = FALSE;
5446                 if (fp->fInputIdx + len <= fActiveLimit) {
5447                     if (opType == URX_BACKREF) {
5448                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5449                             haveMatch = TRUE;
5450                         }
5451                     } else {
5452                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5453                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5454                             haveMatch = TRUE;
5455                         }
5456                     }
5457                 } else {
5458                     // TODO: probably need to do a partial string comparison, and only
5459                     //       set HitEnd if the available input matched.  Ticket #6074
5460                     fHitEnd = TRUE;
5461                 }
5462                 if (haveMatch) {
5463                     fp->fInputIdx += len;     // Match.  Advance current input position.
5464                 } else {
5465                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5466                 }
5467             }
5468             break;
5469 
5470         case URX_STO_INP_LOC:
5471             {
5472                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5473                 fp->fExtra[opValue] = fp->fInputIdx;
5474             }
5475             break;
5476 
5477         case URX_JMPX:
5478             {
5479                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5480                 fp->fPatIdx += 1;
5481                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5482                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5483                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5484                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5485                 if (savedInputIdx < fp->fInputIdx) {
5486                     fp->fPatIdx = opValue;                               // JMP
5487                 } else {
5488                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5489                 }
5490             }
5491             break;
5492 
5493         case URX_LA_START:
5494             {
5495                 // Entering a lookahead block.
5496                 // Save Stack Ptr, Input Pos.
5497                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5498                 fData[opValue]   = fStack->size();
5499                 fData[opValue+1] = fp->fInputIdx;
5500                 fActiveStart     = fLookStart;          // Set the match region change for
5501                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5502             }
5503             break;
5504 
5505         case URX_LA_END:
5506             {
5507                 // Leaving a look-ahead block.
5508                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5509                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5510                 int32_t stackSize = fStack->size();
5511                 int32_t newStackSize = (int32_t)fData[opValue];
5512                 U_ASSERT(stackSize >= newStackSize);
5513                 if (stackSize > newStackSize) {
5514                     // Copy the current top frame back to the new (cut back) top frame.
5515                     //   This makes the capture groups from within the look-ahead
5516                     //   expression available.
5517                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5518                     int32_t i;
5519                     for (i=0; i<fFrameSize; i++) {
5520                         newFP[i] = ((int64_t *)fp)[i];
5521                     }
5522                     fp = (REStackFrame *)newFP;
5523                     fStack->setSize(newStackSize);
5524                 }
5525                 fp->fInputIdx = fData[opValue+1];
5526 
5527                 // Restore the active region bounds in the input string; they may have
5528                 //    been changed because of transparent bounds on a Region.
5529                 fActiveStart = fRegionStart;
5530                 fActiveLimit = fRegionLimit;
5531             }
5532             break;
5533 
5534         case URX_ONECHAR_I:
5535             if (fp->fInputIdx < fActiveLimit) {
5536                 UChar32 c;
5537                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5538                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5539                     break;
5540                 }
5541             } else {
5542                 fHitEnd = TRUE;
5543             }
5544 
5545             #ifdef REGEX_SMART_BACKTRACKING
5546             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5547                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5548                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5549                     UBool success = FALSE;
5550                     int64_t reverseIndex = fp->fInputIdx;
5551                     UChar32 c;
5552                     while (reverseIndex > backSearchIndex) {
5553                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5554                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5555                             success = TRUE;
5556                             break;
5557                         } else if (c == U_SENTINEL) {
5558                             break;
5559                         }
5560                     }
5561                     if (success) {
5562                         fHitEnd = FALSE;
5563                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5564                         fp->fInputIdx = reverseIndex;
5565                         if (fp->fInputIdx > backSearchIndex) {
5566                             fp = StateSave(fp, fp->fPatIdx, status);
5567                         }
5568                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
5569                         break;
5570                     }
5571                 }
5572             }
5573             #endif
5574 
5575             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5576             break;
5577 
5578         case URX_STRING_I:
5579             {
5580                 // Test input against a literal string.
5581                 // Strings require two slots in the compiled pattern, one for the
5582                 //   offset to the string text, and one for the length.
5583                 const UCaseProps *csp = ucase_getSingleton();
5584                 {
5585                     int32_t stringStartIdx, stringLen;
5586                     stringStartIdx = opValue;
5587 
5588                     op      = (int32_t)pat[fp->fPatIdx];
5589                     fp->fPatIdx++;
5590                     opType  = URX_TYPE(op);
5591                     opValue = URX_VAL(op);
5592                     U_ASSERT(opType == URX_STRING_LEN);
5593                     stringLen = opValue;
5594 
5595                     const UChar *patternChars = litText+stringStartIdx;
5596                     const UChar *patternEnd = patternChars+stringLen;
5597 
5598                     const UChar *foldChars = NULL;
5599                     int32_t foldOffset, foldLength;
5600                     UChar32 c;
5601                     UBool c_is_valid = FALSE;
5602 
5603                     #ifdef REGEX_SMART_BACKTRACKING
5604                     int32_t originalInputIdx = fp->fInputIdx;
5605                     #endif
5606                     UBool success = TRUE;
5607 
5608                     foldOffset = foldLength = 0;
5609 
5610                     while (patternChars < patternEnd && success) {
5611                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5612                             if(foldOffset < foldLength) {
5613                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5614                                 c_is_valid = TRUE;
5615                             } else {
5616                                 // test pre-condition of U16_NEXT: i < length
5617                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
5618                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5619                                 c_is_valid = TRUE;
5620                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5621                                 if(foldLength >= 0) {
5622                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5623                                         foldOffset = 0;
5624                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5625                                     } else {
5626                                         c = foldLength;
5627                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
5628                                     }
5629                                 }
5630                             }
5631                         } else {
5632                           c_is_valid = FALSE;
5633                         }
5634 
5635                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5636                             if (U_IS_BMP(c)) {
5637                                 success = (*patternChars == c);
5638                                 patternChars += 1;
5639                             } else if (patternChars+1 < patternEnd) {
5640                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5641                                 patternChars += 2;
5642                             }
5643                         } else {
5644                             success = FALSE;
5645                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
5646                         }
5647                     }
5648 
5649                     if (!success) {
5650                         #ifdef REGEX_SMART_BACKTRACKING
5651                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5652                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5653                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5654                                 // Reset to last start point
5655                                 int64_t reverseIndex = originalInputIdx;
5656                                 patternChars = litText+stringStartIdx;
5657 
5658                                 // Search backwards for a possible start
5659                                 do {
5660                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5661                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5662                                     if(foldLength >= 0) {
5663                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5664                                             foldOffset = 0;
5665                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5666                                         } else {
5667                                             c = foldLength;
5668                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
5669                                         }
5670                                     }
5671 
5672                                     if ((U_IS_BMP(c) && *patternChars == c) ||
5673                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5674                                         success = TRUE;
5675                                         break;
5676                                     }
5677                                 } while (reverseIndex > backSearchIndex);
5678 
5679                                 // And try again
5680                                 if (success) {
5681                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5682                                     fp->fInputIdx = reverseIndex;
5683                                     if (fp->fInputIdx > backSearchIndex) {
5684                                         fp = StateSave(fp, fp->fPatIdx, status);
5685                                     }
5686                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
5687                                     break;
5688                                 }
5689                             }
5690                         }
5691                         #endif
5692                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5693                     }
5694                 }
5695             }
5696             break;
5697 
5698         case URX_LB_START:
5699             {
5700                 // Entering a look-behind block.
5701                 // Save Stack Ptr, Input Pos.
5702                 //   TODO:  implement transparent bounds.  Ticket #6067
5703                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5704                 fData[opValue]   = fStack->size();
5705                 fData[opValue+1] = fp->fInputIdx;
5706                 // Init the variable containing the start index for attempted matches.
5707                 fData[opValue+2] = -1;
5708                 // Save input string length, then reset to pin any matches to end at
5709                 //   the current position.
5710                 fData[opValue+3] = fActiveLimit;
5711                 fActiveLimit     = fp->fInputIdx;
5712             }
5713             break;
5714 
5715 
5716         case URX_LB_CONT:
5717             {
5718                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5719                 //    at all possible input starting positions.
5720 
5721                 // Fetch the min and max possible match lengths.  They are the operands
5722                 //   of this op in the pattern.
5723                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5724                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5725                 U_ASSERT(minML <= maxML);
5726                 U_ASSERT(minML >= 0);
5727 
5728                 // Fetch (from data) the last input index where a match was attempted.
5729                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5730                 int64_t  *lbStartIdx = &fData[opValue+2];
5731                 if (*lbStartIdx < 0) {
5732                     // First time through loop.
5733                     *lbStartIdx = fp->fInputIdx - minML;
5734                 } else {
5735                     // 2nd through nth time through the loop.
5736                     // Back up start position for match by one.
5737                     if (*lbStartIdx == 0) {
5738                         (*lbStartIdx)--;
5739                     } else {
5740                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5741                     }
5742                 }
5743 
5744                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5745                     // We have tried all potential match starting points without
5746                     //  getting a match.  Backtrack out, and out of the
5747                     //   Look Behind altogether.
5748                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5749                     int64_t restoreInputLen = fData[opValue+3];
5750                     U_ASSERT(restoreInputLen >= fActiveLimit);
5751                     U_ASSERT(restoreInputLen <= fInputLength);
5752                     fActiveLimit = restoreInputLen;
5753                     break;
5754                 }
5755 
5756                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5757                 //      (successful match will fall off the end of the loop.)
5758                 fp = StateSave(fp, fp->fPatIdx-3, status);
5759                 fp->fInputIdx =  *lbStartIdx;
5760             }
5761             break;
5762 
5763         case URX_LB_END:
5764             // End of a look-behind block, after a successful match.
5765             {
5766                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5767                 if (fp->fInputIdx != fActiveLimit) {
5768                     //  The look-behind expression matched, but the match did not
5769                     //    extend all the way to the point that we are looking behind from.
5770                     //  FAIL out of here, which will take us back to the LB_CONT, which
5771                     //     will retry the match starting at another position or fail
5772                     //     the look-behind altogether, whichever is appropriate.
5773                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5774                     break;
5775                 }
5776 
5777                 // Look-behind match is good.  Restore the orignal input string length,
5778                 //   which had been truncated to pin the end of the lookbehind match to the
5779                 //   position being looked-behind.
5780                 int64_t originalInputLen = fData[opValue+3];
5781                 U_ASSERT(originalInputLen >= fActiveLimit);
5782                 U_ASSERT(originalInputLen <= fInputLength);
5783                 fActiveLimit = originalInputLen;
5784             }
5785             break;
5786 
5787 
5788         case URX_LBN_CONT:
5789             {
5790                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5791                 //    at all possible input starting positions.
5792 
5793                 // Fetch the extra parameters of this op.
5794                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5795                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5796                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5797                 continueLoc = URX_VAL(continueLoc);
5798                 U_ASSERT(minML <= maxML);
5799                 U_ASSERT(minML >= 0);
5800                 U_ASSERT(continueLoc > fp->fPatIdx);
5801 
5802                 // Fetch (from data) the last input index where a match was attempted.
5803                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5804                 int64_t  *lbStartIdx = &fData[opValue+2];
5805                 if (*lbStartIdx < 0) {
5806                     // First time through loop.
5807                     *lbStartIdx = fp->fInputIdx - minML;
5808                 } else {
5809                     // 2nd through nth time through the loop.
5810                     // Back up start position for match by one.
5811                     if (*lbStartIdx == 0) {
5812                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5813                     } else {
5814                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5815                     }
5816                 }
5817 
5818                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5819                     // We have tried all potential match starting points without
5820                     //  getting a match, which means that the negative lookbehind as
5821                     //  a whole has succeeded.  Jump forward to the continue location
5822                     int64_t restoreInputLen = fData[opValue+3];
5823                     U_ASSERT(restoreInputLen >= fActiveLimit);
5824                     U_ASSERT(restoreInputLen <= fInputLength);
5825                     fActiveLimit = restoreInputLen;
5826                     fp->fPatIdx = continueLoc;
5827                     break;
5828                 }
5829 
5830                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5831                 //      (successful match will cause a FAIL out of the loop altogether.)
5832                 fp = StateSave(fp, fp->fPatIdx-4, status);
5833                 fp->fInputIdx =  *lbStartIdx;
5834             }
5835             break;
5836 
5837         case URX_LBN_END:
5838             // End of a negative look-behind block, after a successful match.
5839             {
5840                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5841                 if (fp->fInputIdx != fActiveLimit) {
5842                     //  The look-behind expression matched, but the match did not
5843                     //    extend all the way to the point that we are looking behind from.
5844                     //  FAIL out of here, which will take us back to the LB_CONT, which
5845                     //     will retry the match starting at another position or succeed
5846                     //     the look-behind altogether, whichever is appropriate.
5847                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5848                     break;
5849                 }
5850 
5851                 // Look-behind expression matched, which means look-behind test as
5852                 //   a whole Fails
5853 
5854                 //   Restore the orignal input string length, which had been truncated
5855                 //   inorder to pin the end of the lookbehind match
5856                 //   to the position being looked-behind.
5857                 int64_t originalInputLen = fData[opValue+3];
5858                 U_ASSERT(originalInputLen >= fActiveLimit);
5859                 U_ASSERT(originalInputLen <= fInputLength);
5860                 fActiveLimit = originalInputLen;
5861 
5862                 // Restore original stack position, discarding any state saved
5863                 //   by the successful pattern match.
5864                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5865                 int32_t newStackSize = (int32_t)fData[opValue];
5866                 U_ASSERT(fStack->size() > newStackSize);
5867                 fStack->setSize(newStackSize);
5868 
5869                 //  FAIL, which will take control back to someplace
5870                 //  prior to entering the look-behind test.
5871                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5872             }
5873             break;
5874 
5875 
5876         case URX_LOOP_SR_I:
5877             // Loop Initialization for the optimized implementation of
5878             //     [some character set]*
5879             //   This op scans through all matching input.
5880             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5881             {
5882                 U_ASSERT(opValue > 0 && opValue < sets->size());
5883                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5884                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5885 
5886                 // Loop through input, until either the input is exhausted or
5887                 //   we reach a character that is not a member of the set.
5888                 int32_t ix = (int32_t)fp->fInputIdx;
5889                 for (;;) {
5890                     if (ix >= fActiveLimit) {
5891                         fHitEnd = TRUE;
5892                         break;
5893                     }
5894                     UChar32   c;
5895                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5896                     if (c<256) {
5897                         if (s8->contains(c) == FALSE) {
5898                             U16_BACK_1(inputBuf, 0, ix);
5899                             break;
5900                         }
5901                     } else {
5902                         if (s->contains(c) == FALSE) {
5903                             U16_BACK_1(inputBuf, 0, ix);
5904                             break;
5905                         }
5906                     }
5907                 }
5908 
5909                 // If there were no matching characters, skip over the loop altogether.
5910                 //   The loop doesn't run at all, a * op always succeeds.
5911                 if (ix == fp->fInputIdx) {
5912                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5913                     break;
5914                 }
5915 
5916                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5917                 //   must follow.  It's operand is the stack location
5918                 //   that holds the starting input index for the match of this [set]*
5919                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5920                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5921                 int32_t stackLoc = URX_VAL(loopcOp);
5922                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5923                 fp->fExtra[stackLoc] = fp->fInputIdx;
5924                 #ifdef REGEX_SMART_BACKTRACKING
5925                 backSearchIndex = fp->fInputIdx;
5926                 #endif
5927                 fp->fInputIdx = ix;
5928 
5929                 // Save State to the URX_LOOP_C op that follows this one,
5930                 //   so that match failures in the following code will return to there.
5931                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5932                 fp = StateSave(fp, fp->fPatIdx, status);
5933                 fp->fPatIdx++;
5934             }
5935             break;
5936 
5937 
5938         case URX_LOOP_DOT_I:
5939             // Loop Initialization for the optimized implementation of .*
5940             //   This op scans through all remaining input.
5941             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5942             {
5943                 // Loop through input until the input is exhausted (we reach an end-of-line)
5944                 // In DOTALL mode, we can just go straight to the end of the input.
5945                 int32_t ix;
5946                 if ((opValue & 1) == 1) {
5947                     // Dot-matches-All mode.  Jump straight to the end of the string.
5948                     ix = (int32_t)fActiveLimit;
5949                     fHitEnd = TRUE;
5950                 } else {
5951                     // NOT DOT ALL mode.  Line endings do not match '.'
5952                     // Scan forward until a line ending or end of input.
5953                     ix = (int32_t)fp->fInputIdx;
5954                     for (;;) {
5955                         if (ix >= fActiveLimit) {
5956                             fHitEnd = TRUE;
5957                             break;
5958                         }
5959                         UChar32   c;
5960                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5961                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5962                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5963                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5964                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5965                                 //  char is a line ending.  Put the input pos back to the
5966                                 //    line ending char, and exit the scanning loop.
5967                                 U16_BACK_1(inputBuf, 0, ix);
5968                                 break;
5969                             }
5970                         }
5971                     }
5972                 }
5973 
5974                 // If there were no matching characters, skip over the loop altogether.
5975                 //   The loop doesn't run at all, a * op always succeeds.
5976                 if (ix == fp->fInputIdx) {
5977                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5978                     break;
5979                 }
5980 
5981                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5982                 //   must follow.  It's operand is the stack location
5983                 //   that holds the starting input index for the match of this .*
5984                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5985                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5986                 int32_t stackLoc = URX_VAL(loopcOp);
5987                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5988                 fp->fExtra[stackLoc] = fp->fInputIdx;
5989                 #ifdef REGEX_SMART_BACKTRACKING
5990                 backSearchIndex = fp->fInputIdx;
5991                 #endif
5992                 fp->fInputIdx = ix;
5993 
5994                 // Save State to the URX_LOOP_C op that follows this one,
5995                 //   so that match failures in the following code will return to there.
5996                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5997                 fp = StateSave(fp, fp->fPatIdx, status);
5998                 fp->fPatIdx++;
5999             }
6000             break;
6001 
6002 
6003         case URX_LOOP_C:
6004             {
6005                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
6006                 backSearchIndex = (int32_t)fp->fExtra[opValue];
6007                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
6008                 if (backSearchIndex == fp->fInputIdx) {
6009                     // We've backed up the input idx to the point that the loop started.
6010                     // The loop is done.  Leave here without saving state.
6011                     //  Subsequent failures won't come back here.
6012                     break;
6013                 }
6014                 // Set up for the next iteration of the loop, with input index
6015                 //   backed up by one from the last time through,
6016                 //   and a state save to this instruction in case the following code fails again.
6017                 //   (We're going backwards because this loop emulates stack unwinding, not
6018                 //    the initial scan forward.)
6019                 U_ASSERT(fp->fInputIdx > 0);
6020                 UChar32 prevC;
6021                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6022 
6023                 if (prevC == 0x0a &&
6024                     fp->fInputIdx > backSearchIndex &&
6025                     inputBuf[fp->fInputIdx-1] == 0x0d) {
6026                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6027                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6028                         // .*, stepping back over CRLF pair.
6029                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6030                     }
6031                 }
6032 
6033 
6034                 fp = StateSave(fp, fp->fPatIdx-1, status);
6035             }
6036             break;
6037 
6038 
6039 
6040         default:
6041             // Trouble.  The compiled pattern contains an entry with an
6042             //           unrecognized type tag.
6043             U_ASSERT(FALSE);
6044         }
6045 
6046         if (U_FAILURE(status)) {
6047             isMatch = FALSE;
6048             break;
6049         }
6050     }
6051 
6052 breakFromLoop:
6053     fMatch = isMatch;
6054     if (isMatch) {
6055         fLastMatchEnd = fMatchEnd;
6056         fMatchStart   = startIdx;
6057         fMatchEnd     = fp->fInputIdx;
6058         if (fTraceDebug) {
6059             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6060         }
6061     }
6062     else
6063     {
6064         if (fTraceDebug) {
6065             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6066         }
6067     }
6068 
6069     fFrame = fp;                // The active stack frame when the engine stopped.
6070     //   Contains the capture group results that we need to
6071     //    access later.
6072 
6073     return;
6074 }
6075 
6076 
6077 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6078 
6079 U_NAMESPACE_END
6080 
6081 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6082