• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2010 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31 
32 // #include <malloc.h>        // Needed for heapcheck testing
33 
34 
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40     (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41 
42 
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53 
54 //#define REGEX_SMART_BACKTRACKING 1
55 
56 U_NAMESPACE_BEGIN
57 
58 // Default limit for the size of the back track stack, to avoid system
59 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 //    which use recursion rather than the heap, and take more storage per
62 //    backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65 
66 // Time limit counter constant.
67 //   Time limits for expression evaluation are in terms of quanta of work by
68 //   the engine, each of which is 10,000 state saves.
69 //   This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71 
72 //-----------------------------------------------------------------------------
73 //
74 //   Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78     fDeferredStatus = U_ZERO_ERROR;
79     init(fDeferredStatus);
80     if (U_FAILURE(fDeferredStatus)) {
81         return;
82     }
83     if (pat==NULL) {
84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85         return;
86     }
87     fPattern = pat;
88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90 
91 
92 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                            uint32_t flags, UErrorCode &status) {
95     init(status);
96     if (U_FAILURE(status)) {
97         return;
98     }
99     UParseError    pe;
100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101     fPattern           = fPatternOwned;
102 
103     UText inputText = UTEXT_INITIALIZER;
104     utext_openConstUnicodeString(&inputText, &input, &status);
105     init2(&inputText, status);
106     utext_close(&inputText);
107 
108     fInputUniStrMaybeMutable = TRUE;
109 }
110 
111 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                            uint32_t flags, UErrorCode &status) {
114     init(status);
115     if (U_FAILURE(status)) {
116         return;
117     }
118     UParseError    pe;
119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120     if (U_FAILURE(status)) {
121         return;
122     }
123 
124     fPattern           = fPatternOwned;
125     init2(input, status);
126 }
127 
128 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                            uint32_t flags, UErrorCode &status) {
131     init(status);
132     if (U_FAILURE(status)) {
133         return;
134     }
135     UParseError    pe;
136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137     if (U_FAILURE(status)) {
138         return;
139     }
140     fPattern           = fPatternOwned;
141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145                            uint32_t flags, UErrorCode &status) {
146     init(status);
147     if (U_FAILURE(status)) {
148         return;
149     }
150     UParseError    pe;
151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152         if (U_FAILURE(status)) {
153         return;
154     }
155 
156     fPattern           = fPatternOwned;
157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159 
160 
161 
162 
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164     delete fStack;
165     if (fData != fSmallData) {
166         uprv_free(fData);
167         fData = NULL;
168     }
169     if (fPatternOwned) {
170         delete fPatternOwned;
171         fPatternOwned = NULL;
172         fPattern = NULL;
173     }
174 
175     if (fInput) {
176         delete fInput;
177     }
178     if (fInputText) {
179         utext_close(fInputText);
180     }
181     if (fAltInputText) {
182         utext_close(fAltInputText);
183     }
184 
185     #if UCONFIG_NO_BREAK_ITERATION==0
186     delete fWordBreakItr;
187     #endif
188 }
189 
190 //
191 //   init()   common initialization for use by all constructors.
192 //            Initialize all fields, get the object into a consistent state.
193 //            This must be done even when the initial status shows an error,
194 //            so that the object is initialized sufficiently well for the destructor
195 //            to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198     fPattern           = NULL;
199     fPatternOwned      = NULL;
200     fFrameSize         = 0;
201     fRegionStart       = 0;
202     fRegionLimit       = 0;
203     fAnchorStart       = 0;
204     fAnchorLimit       = 0;
205     fLookStart         = 0;
206     fLookLimit         = 0;
207     fActiveStart       = 0;
208     fActiveLimit       = 0;
209     fTransparentBounds = FALSE;
210     fAnchoringBounds   = TRUE;
211     fMatch             = FALSE;
212     fMatchStart        = 0;
213     fMatchEnd          = 0;
214     fLastMatchEnd      = -1;
215     fAppendPosition    = 0;
216     fHitEnd            = FALSE;
217     fRequireEnd        = FALSE;
218     fStack             = NULL;
219     fFrame             = NULL;
220     fTimeLimit         = 0;
221     fTime              = 0;
222     fTickCounter       = 0;
223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224     fCallbackFn        = NULL;
225     fCallbackContext   = NULL;
226     fFindProgressCallbackFn      = NULL;
227     fFindProgressCallbackContext = NULL;
228     fTraceDebug        = FALSE;
229     fDeferredStatus    = status;
230     fData              = fSmallData;
231     fWordBreakItr      = NULL;
232 
233     fStack             = new UVector64(status);
234     fInputText         = NULL;
235     fAltInputText      = NULL;
236     fInput             = NULL;
237     fInputLength       = 0;
238     fInputUniStrMaybeMutable = FALSE;
239 
240     if (U_FAILURE(status)) {
241         fDeferredStatus = status;
242     }
243 }
244 
245 //
246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247 //            This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250     if (U_FAILURE(status)) {
251         fDeferredStatus = status;
252         return;
253     }
254 
255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257         if (fData == NULL) {
258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259             return;
260         }
261     }
262 
263     reset(input);
264     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265     if (U_FAILURE(status)) {
266         fDeferredStatus = status;
267         return;
268     }
269 }
270 
271 
272 static const UChar BACKSLASH  = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 //    appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                               const UnicodeString &replacement,
281                                               UErrorCode &status) {
282     UText replacementText = UTEXT_INITIALIZER;
283 
284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
285     if (U_SUCCESS(status)) {
286         UText resultText = UTEXT_INITIALIZER;
287         utext_openUnicodeString(&resultText, &dest, &status);
288 
289         if (U_SUCCESS(status)) {
290             appendReplacement(&resultText, &replacementText, status);
291             utext_close(&resultText);
292         }
293         utext_close(&replacementText);
294     }
295 
296     return *this;
297 }
298 
299 //
300 //    appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                               UText *replacement,
304                                               UErrorCode &status) {
305     if (U_FAILURE(status)) {
306         return *this;
307     }
308     if (U_FAILURE(fDeferredStatus)) {
309         status = fDeferredStatus;
310         return *this;
311     }
312     if (fMatch == FALSE) {
313         status = U_REGEX_INVALID_STATE;
314         return *this;
315     }
316 
317     // Copy input string from the end of previous match to start of current match
318     int64_t  destLen = utext_nativeLength(dest);
319     if (fMatchStart > fAppendPosition) {
320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
323         } else {
324             int32_t len16;
325             if (UTEXT_USES_U16(fInputText)) {
326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
327             } else {
328                 UErrorCode lengthStatus = U_ZERO_ERROR;
329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330             }
331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332             if (inputChars == NULL) {
333                 status = U_MEMORY_ALLOCATION_ERROR;
334                 return *this;
335             }
336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338             uprv_free(inputChars);
339         }
340     }
341     fAppendPosition = fMatchEnd;
342 
343 
344     // scan the replacement text, looking for substitutions ($n) and \escapes.
345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346     //         move entire ranges not containing substitutions.
347     UTEXT_SETNATIVEINDEX(replacement, 0);
348     UChar32 c = UTEXT_NEXT32(replacement);
349     while (c != U_SENTINEL) {
350         if (c == BACKSLASH) {
351             // Backslash Escape.  Copy the following char out without further checks.
352             //                    Note:  Surrogate pairs don't need any special handling
353             //                           The second half wont be a '$' or a '\', and
354             //                           will move to the dest normally on the next
355             //                           loop iteration.
356             c = UTEXT_CURRENT32(replacement);
357             if (c == U_SENTINEL) {
358                 break;
359             }
360 
361             if (c==0x55/*U*/ || c==0x75/*u*/) {
362                 // We have a \udddd or \Udddddddd escape sequence.
363                 int32_t offset = 0;
364                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                 if (escapedChar != (UChar32)0xFFFFFFFF) {
367                     if (U_IS_BMP(escapedChar)) {
368                         UChar c16 = (UChar)escapedChar;
369                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                     } else {
371                         UChar surrogate[2];
372                         surrogate[0] = U16_LEAD(escapedChar);
373                         surrogate[1] = U16_TRAIL(escapedChar);
374                         if (U_SUCCESS(status)) {
375                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                         }
377                     }
378                     // TODO:  Report errors for mal-formed \u escapes?
379                     //        As this is, the original sequence is output, which may be OK.
380                     if (context.lastOffset == offset) {
381                         UTEXT_PREVIOUS32(replacement);
382                     } else if (context.lastOffset != offset-1) {
383                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                     }
385                 }
386             } else {
387                 UTEXT_NEXT32(replacement);
388                 // Plain backslash escape.  Just put out the escaped character.
389                 if (U_IS_BMP(c)) {
390                     UChar c16 = (UChar)c;
391                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                 } else {
393                     UChar surrogate[2];
394                     surrogate[0] = U16_LEAD(c);
395                     surrogate[1] = U16_TRAIL(c);
396                     if (U_SUCCESS(status)) {
397                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                     }
399                 }
400             }
401         } else if (c != DOLLARSIGN) {
402             // Normal char, not a $.  Copy it out without further checks.
403             if (U_IS_BMP(c)) {
404                 UChar c16 = (UChar)c;
405                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406             } else {
407                 UChar surrogate[2];
408                 surrogate[0] = U16_LEAD(c);
409                 surrogate[1] = U16_TRAIL(c);
410                 if (U_SUCCESS(status)) {
411                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                 }
413             }
414         } else {
415             // We've got a $.  Pick up a capture group number if one follows.
416             // Consume at most the number of digits necessary for the largest capture
417             // number that is valid for this pattern.
418 
419             int32_t numDigits = 0;
420             int32_t groupNum  = 0;
421             UChar32 digitC;
422             for (;;) {
423                 digitC = UTEXT_CURRENT32(replacement);
424                 if (digitC == U_SENTINEL) {
425                     break;
426                 }
427                 if (u_isdigit(digitC) == FALSE) {
428                     break;
429                 }
430                 UTEXT_NEXT32(replacement);
431                 groupNum=groupNum*10 + u_charDigitValue(digitC);
432                 numDigits++;
433                 if (numDigits >= fPattern->fMaxCaptureDigits) {
434                     break;
435                 }
436             }
437 
438 
439             if (numDigits == 0) {
440                 // The $ didn't introduce a group number at all.
441                 // Treat it as just part of the substitution text.
442                 UChar c16 = DOLLARSIGN;
443                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444             } else {
445                 // Finally, append the capture group data to the destination.
446                 destLen += appendGroup(groupNum, dest, status);
447                 if (U_FAILURE(status)) {
448                     // Can fail if group number is out of range.
449                     break;
450                 }
451             }
452         }
453 
454         if (U_FAILURE(status)) {
455             break;
456         } else {
457             c = UTEXT_NEXT32(replacement);
458         }
459     }
460 
461     return *this;
462 }
463 
464 
465 
466 //--------------------------------------------------------------------------------
467 //
468 //    appendTail     Intended to be used in conjunction with appendReplacement()
469 //                   To the destination string, append everything following
470 //                   the last match position from the input string.
471 //
472 //                   Note:  Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476     UErrorCode status = U_ZERO_ERROR;
477     UText resultText = UTEXT_INITIALIZER;
478     utext_openUnicodeString(&resultText, &dest, &status);
479 
480     if (U_SUCCESS(status)) {
481         appendTail(&resultText, status);
482         utext_close(&resultText);
483     }
484 
485     return dest;
486 }
487 
488 //
489 //   appendTail, UText mode
490 //
appendTail(UText * dest,UErrorCode & status)491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492     UBool bailOut = FALSE;
493     if (U_FAILURE(status)) {
494         bailOut = TRUE;
495     }
496     if (U_FAILURE(fDeferredStatus)) {
497         status = fDeferredStatus;
498         bailOut = TRUE;
499     }
500 
501     if (bailOut) {
502         //  dest must not be NULL
503         if (dest) {
504             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505             return dest;
506         }
507     }
508 
509     if (fInputLength > fAppendPosition) {
510         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511             int64_t destLen = utext_nativeLength(dest);
512             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                           (int32_t)(fInputLength-fAppendPosition), &status);
514         } else {
515             int32_t len16;
516             if (UTEXT_USES_U16(fInputText)) {
517                 len16 = (int32_t)(fInputLength-fAppendPosition);
518             } else {
519                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                 status = U_ZERO_ERROR; // buffer overflow
521             }
522 
523             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524             if (inputChars == NULL) {
525                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526             } else {
527                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                 int64_t destLen = utext_nativeLength(dest);
529                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                 uprv_free(inputChars);
531             }
532         }
533     }
534     return dest;
535 }
536 
537 
538 
539 //--------------------------------------------------------------------------------
540 //
541 //   end
542 //
543 //--------------------------------------------------------------------------------
end(UErrorCode & err) const544 int32_t RegexMatcher::end(UErrorCode &err) const {
545     return end(0, err);
546 }
547 
end64(UErrorCode & err) const548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549     return end64(0, err);
550 }
551 
end64(int32_t group,UErrorCode & err) const552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553     if (U_FAILURE(err)) {
554         return -1;
555     }
556     if (fMatch == FALSE) {
557         err = U_REGEX_INVALID_STATE;
558         return -1;
559     }
560     if (group < 0 || group > fPattern->fGroupMap->size()) {
561         err = U_INDEX_OUTOFBOUNDS_ERROR;
562         return -1;
563     }
564     int64_t e = -1;
565     if (group == 0) {
566         e = fMatchEnd;
567     } else {
568         // Get the position within the stack frame of the variables for
569         //    this capture group.
570         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571         U_ASSERT(groupOffset < fPattern->fFrameSize);
572         U_ASSERT(groupOffset >= 0);
573         e = fFrame->fExtra[groupOffset + 1];
574     }
575 
576         return e;
577 }
578 
end(int32_t group,UErrorCode & err) const579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580     return (int32_t)end64(group, err);
581 }
582 
583 
584 //--------------------------------------------------------------------------------
585 //
586 //   find()
587 //
588 //--------------------------------------------------------------------------------
find()589 UBool RegexMatcher::find() {
590     // Start at the position of the last match end.  (Will be zero if the
591     //   matcher has been reset.)
592     //
593     if (U_FAILURE(fDeferredStatus)) {
594         return FALSE;
595     }
596 
597     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598         return findUsingChunk();
599     }
600 
601     int64_t startPos = fMatchEnd;
602     if (startPos==0) {
603         startPos = fActiveStart;
604     }
605 
606     if (fMatch) {
607         // Save the position of any previous successful match.
608         fLastMatchEnd = fMatchEnd;
609 
610         if (fMatchStart == fMatchEnd) {
611             // Previous match had zero length.  Move start position up one position
612             //  to avoid sending find() into a loop on zero-length matches.
613             if (startPos >= fActiveLimit) {
614                 fMatch = FALSE;
615                 fHitEnd = TRUE;
616                 return FALSE;
617             }
618             UTEXT_SETNATIVEINDEX(fInputText, startPos);
619             UTEXT_NEXT32(fInputText);
620             startPos = UTEXT_GETNATIVEINDEX(fInputText);
621         }
622     } else {
623         if (fLastMatchEnd >= 0) {
624             // A previous find() failed to match.  Don't try again.
625             //   (without this test, a pattern with a zero-length match
626             //    could match again at the end of an input string.)
627             fHitEnd = TRUE;
628             return FALSE;
629         }
630     }
631 
632 
633     // Compute the position in the input string beyond which a match can not begin, because
634     //   the minimum length match would extend past the end of the input.
635     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636     //          Be aware of possible overflows if making changes here.
637     int64_t testStartLimit;
638     if (UTEXT_USES_U16(fInputText)) {
639         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640         if (startPos > testStartLimit) {
641             fMatch = FALSE;
642             fHitEnd = TRUE;
643             return FALSE;
644         }
645     } else {
646         // For now, let the matcher discover that it can't match on its own
647         // We don't know how long the match len is in native characters
648         testStartLimit = fActiveLimit;
649     }
650 
651     UChar32  c;
652     U_ASSERT(startPos >= 0);
653 
654     switch (fPattern->fStartType) {
655     case START_NO_INFO:
656         // No optimization was found.
657         //  Try a match at each input position.
658         for (;;) {
659             MatchAt(startPos, FALSE, fDeferredStatus);
660             if (U_FAILURE(fDeferredStatus)) {
661                 return FALSE;
662             }
663             if (fMatch) {
664                 return TRUE;
665             }
666             if (startPos >= testStartLimit) {
667                 fHitEnd = TRUE;
668                 return FALSE;
669             }
670             UTEXT_SETNATIVEINDEX(fInputText, startPos);
671             UTEXT_NEXT32(fInputText);
672             startPos = UTEXT_GETNATIVEINDEX(fInputText);
673             // Note that it's perfectly OK for a pattern to have a zero-length
674             //   match at the end of a string, so we must make sure that the loop
675             //   runs with startPos == testStartLimit the last time through.
676             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                 return FALSE;
678         }
679         U_ASSERT(FALSE);
680 
681     case START_START:
682         // Matches are only possible at the start of the input string
683         //   (pattern begins with ^ or \A)
684         if (startPos > fActiveStart) {
685             fMatch = FALSE;
686             return FALSE;
687         }
688         MatchAt(startPos, FALSE, fDeferredStatus);
689         if (U_FAILURE(fDeferredStatus)) {
690             return FALSE;
691         }
692         return fMatch;
693 
694 
695     case START_SET:
696         {
697             // Match may start on any char from a pre-computed set.
698             U_ASSERT(fPattern->fMinMatchLen > 0);
699             int64_t pos;
700             UTEXT_SETNATIVEINDEX(fInputText, startPos);
701             for (;;) {
702                 c = UTEXT_NEXT32(fInputText);
703                 pos = UTEXT_GETNATIVEINDEX(fInputText);
704                 // c will be -1 (U_SENTINEL) at end of text, in which case we
705                 // skip this next block (so we don't have a negative array index)
706                 // and handle end of text in the following block.
707                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                     MatchAt(startPos, FALSE, fDeferredStatus);
710                     if (U_FAILURE(fDeferredStatus)) {
711                         return FALSE;
712                     }
713                     if (fMatch) {
714                         return TRUE;
715                     }
716                     UTEXT_SETNATIVEINDEX(fInputText, pos);
717                 }
718                 if (startPos >= testStartLimit) {
719                     fMatch = FALSE;
720                     fHitEnd = TRUE;
721                     return FALSE;
722                 }
723                 startPos = pos;
724 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                     return FALSE;
726             }
727         }
728         U_ASSERT(FALSE);
729 
730     case START_STRING:
731     case START_CHAR:
732         {
733             // Match starts on exactly one char.
734             U_ASSERT(fPattern->fMinMatchLen > 0);
735             UChar32 theChar = fPattern->fInitialChar;
736             int64_t pos;
737             UTEXT_SETNATIVEINDEX(fInputText, startPos);
738             for (;;) {
739                 c = UTEXT_NEXT32(fInputText);
740                 pos = UTEXT_GETNATIVEINDEX(fInputText);
741                 if (c == theChar) {
742                     MatchAt(startPos, FALSE, fDeferredStatus);
743                     if (U_FAILURE(fDeferredStatus)) {
744                         return FALSE;
745                     }
746                     if (fMatch) {
747                         return TRUE;
748                     }
749                     UTEXT_SETNATIVEINDEX(fInputText, pos);
750                 }
751                 if (startPos >= testStartLimit) {
752                     fMatch = FALSE;
753                     fHitEnd = TRUE;
754                     return FALSE;
755                 }
756                 startPos = pos;
757 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                     return FALSE;
759            }
760         }
761         U_ASSERT(FALSE);
762 
763     case START_LINE:
764         {
765             UChar32  c;
766             if (startPos == fAnchorStart) {
767                 MatchAt(startPos, FALSE, fDeferredStatus);
768                 if (U_FAILURE(fDeferredStatus)) {
769                     return FALSE;
770                 }
771                 if (fMatch) {
772                     return TRUE;
773                 }
774                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                 c = UTEXT_NEXT32(fInputText);
776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777             } else {
778                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                 c = UTEXT_PREVIOUS32(fInputText);
780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781             }
782 
783             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                 for (;;) {
785                     if (c == 0x0a) {
786                             MatchAt(startPos, FALSE, fDeferredStatus);
787                             if (U_FAILURE(fDeferredStatus)) {
788                                 return FALSE;
789                             }
790                             if (fMatch) {
791                                 return TRUE;
792                             }
793                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                     }
795                     if (startPos >= testStartLimit) {
796                         fMatch = FALSE;
797                         fHitEnd = TRUE;
798                         return FALSE;
799                     }
800                     c = UTEXT_NEXT32(fInputText);
801                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                     // Note that it's perfectly OK for a pattern to have a zero-length
803                     //   match at the end of a string, so we must make sure that the loop
804                     //   runs with startPos == testStartLimit the last time through.
805 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                         return FALSE;
807                 }
808             } else {
809                 for (;;) {
810                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                 UTEXT_NEXT32(fInputText);
814                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                             }
816                             MatchAt(startPos, FALSE, fDeferredStatus);
817                             if (U_FAILURE(fDeferredStatus)) {
818                                 return FALSE;
819                             }
820                             if (fMatch) {
821                                 return TRUE;
822                             }
823                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                     }
825                     if (startPos >= testStartLimit) {
826                         fMatch = FALSE;
827                         fHitEnd = TRUE;
828                         return FALSE;
829                     }
830                     c = UTEXT_NEXT32(fInputText);
831                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                     // Note that it's perfectly OK for a pattern to have a zero-length
833                     //   match at the end of a string, so we must make sure that the loop
834                     //   runs with startPos == testStartLimit the last time through.
835 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                         return FALSE;
837                 }
838             }
839         }
840 
841     default:
842         U_ASSERT(FALSE);
843     }
844 
845     U_ASSERT(FALSE);
846     return FALSE;
847 }
848 
849 
850 
find(int64_t start,UErrorCode & status)851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852     if (U_FAILURE(status)) {
853         return FALSE;
854     }
855     if (U_FAILURE(fDeferredStatus)) {
856         status = fDeferredStatus;
857         return FALSE;
858     }
859     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                           //        This will reset the region to be the full input length.
861     if (start < 0) {
862         status = U_INDEX_OUTOFBOUNDS_ERROR;
863         return FALSE;
864     }
865 
866     int64_t nativeStart = start;
867     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868         status = U_INDEX_OUTOFBOUNDS_ERROR;
869         return FALSE;
870     }
871     fMatchEnd = nativeStart;
872     return find();
873 }
874 
875 
876 //--------------------------------------------------------------------------------
877 //
878 //   findUsingChunk() -- like find(), but with the advance knowledge that the
879 //                       entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
findUsingChunk()882 UBool RegexMatcher::findUsingChunk() {
883     // Start at the position of the last match end.  (Will be zero if the
884     //   matcher has been reset.
885     //
886 
887     int32_t startPos = (int32_t)fMatchEnd;
888     if (startPos==0) {
889         startPos = (int32_t)fActiveStart;
890     }
891 
892     const UChar *inputBuf = fInputText->chunkContents;
893 
894     if (fMatch) {
895         // Save the position of any previous successful match.
896         fLastMatchEnd = fMatchEnd;
897 
898         if (fMatchStart == fMatchEnd) {
899             // Previous match had zero length.  Move start position up one position
900             //  to avoid sending find() into a loop on zero-length matches.
901             if (startPos >= fActiveLimit) {
902                 fMatch = FALSE;
903                 fHitEnd = TRUE;
904                 return FALSE;
905             }
906             U16_FWD_1(inputBuf, startPos, fInputLength);
907         }
908     } else {
909         if (fLastMatchEnd >= 0) {
910             // A previous find() failed to match.  Don't try again.
911             //   (without this test, a pattern with a zero-length match
912             //    could match again at the end of an input string.)
913             fHitEnd = TRUE;
914             return FALSE;
915         }
916     }
917 
918 
919     // Compute the position in the input string beyond which a match can not begin, because
920     //   the minimum length match would extend past the end of the input.
921     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922     //          Be aware of possible overflows if making changes here.
923     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924     if (startPos > testLen) {
925         fMatch = FALSE;
926         fHitEnd = TRUE;
927         return FALSE;
928     }
929 
930     UChar32  c;
931     U_ASSERT(startPos >= 0);
932 
933     switch (fPattern->fStartType) {
934     case START_NO_INFO:
935         // No optimization was found.
936         //  Try a match at each input position.
937         for (;;) {
938             MatchChunkAt(startPos, FALSE, fDeferredStatus);
939             if (U_FAILURE(fDeferredStatus)) {
940                 return FALSE;
941             }
942             if (fMatch) {
943                 return TRUE;
944             }
945             if (startPos >= testLen) {
946                 fHitEnd = TRUE;
947                 return FALSE;
948             }
949             U16_FWD_1(inputBuf, startPos, fActiveLimit);
950             // Note that it's perfectly OK for a pattern to have a zero-length
951             //   match at the end of a string, so we must make sure that the loop
952             //   runs with startPos == testLen the last time through.
953             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                 return FALSE;
955         }
956         U_ASSERT(FALSE);
957 
958     case START_START:
959         // Matches are only possible at the start of the input string
960         //   (pattern begins with ^ or \A)
961         if (startPos > fActiveStart) {
962             fMatch = FALSE;
963             return FALSE;
964         }
965         MatchChunkAt(startPos, FALSE, fDeferredStatus);
966         if (U_FAILURE(fDeferredStatus)) {
967             return FALSE;
968         }
969         return fMatch;
970 
971 
972     case START_SET:
973     {
974         // Match may start on any char from a pre-computed set.
975         U_ASSERT(fPattern->fMinMatchLen > 0);
976         for (;;) {
977             int32_t pos = startPos;
978             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                 (c>=256 && fPattern->fInitialChars->contains(c))) {
981                 MatchChunkAt(pos, FALSE, fDeferredStatus);
982                 if (U_FAILURE(fDeferredStatus)) {
983                     return FALSE;
984                 }
985                 if (fMatch) {
986                     return TRUE;
987                 }
988             }
989             if (pos >= testLen) {
990                 fMatch = FALSE;
991                 fHitEnd = TRUE;
992                 return FALSE;
993             }
994             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                 return FALSE;
996         }
997     }
998         U_ASSERT(FALSE);
999 
1000     case START_STRING:
1001     case START_CHAR:
1002     {
1003         // Match starts on exactly one char.
1004         U_ASSERT(fPattern->fMinMatchLen > 0);
1005         UChar32 theChar = fPattern->fInitialChar;
1006         for (;;) {
1007             int32_t pos = startPos;
1008             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009             if (c == theChar) {
1010                 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                 if (U_FAILURE(fDeferredStatus)) {
1012                     return FALSE;
1013                 }
1014                 if (fMatch) {
1015                     return TRUE;
1016                 }
1017             }
1018             if (pos >= testLen) {
1019                 fMatch = FALSE;
1020                 fHitEnd = TRUE;
1021                 return FALSE;
1022             }
1023             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                 return FALSE;
1025         }
1026     }
1027         U_ASSERT(FALSE);
1028 
1029     case START_LINE:
1030     {
1031         UChar32  c;
1032         if (startPos == fAnchorStart) {
1033             MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034             if (U_FAILURE(fDeferredStatus)) {
1035                 return FALSE;
1036             }
1037             if (fMatch) {
1038                 return TRUE;
1039             }
1040             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041         }
1042 
1043         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044             for (;;) {
1045                 c = inputBuf[startPos-1];
1046                 if (c == 0x0a) {
1047                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                     if (U_FAILURE(fDeferredStatus)) {
1049                         return FALSE;
1050                     }
1051                     if (fMatch) {
1052                         return TRUE;
1053                     }
1054                 }
1055                 if (startPos >= testLen) {
1056                     fMatch = FALSE;
1057                     fHitEnd = TRUE;
1058                     return FALSE;
1059                 }
1060                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                 // Note that it's perfectly OK for a pattern to have a zero-length
1062                 //   match at the end of a string, so we must make sure that the loop
1063                 //   runs with startPos == testLen the last time through.
1064 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                     return FALSE;
1066             }
1067         } else {
1068             for (;;) {
1069                 c = inputBuf[startPos-1];
1070                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                         startPos++;
1074                     }
1075                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                     if (U_FAILURE(fDeferredStatus)) {
1077                         return FALSE;
1078                     }
1079                     if (fMatch) {
1080                         return TRUE;
1081                     }
1082                 }
1083                 if (startPos >= testLen) {
1084                     fMatch = FALSE;
1085                     fHitEnd = TRUE;
1086                     return FALSE;
1087                 }
1088                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                 // Note that it's perfectly OK for a pattern to have a zero-length
1090                 //   match at the end of a string, so we must make sure that the loop
1091                 //   runs with startPos == testLen the last time through.
1092 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                     return FALSE;
1094             }
1095         }
1096     }
1097 
1098     default:
1099         U_ASSERT(FALSE);
1100     }
1101 
1102     U_ASSERT(FALSE);
1103     return FALSE;
1104 }
1105 
1106 
1107 
1108 //--------------------------------------------------------------------------------
1109 //
1110 //  group()
1111 //
1112 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114     return group(0, status);
1115 }
1116 
1117 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119     return group(0, dest, group_len, status);
1120 }
1121 
1122 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124     group_len = 0;
1125     UBool bailOut = FALSE;
1126     if (U_FAILURE(status)) {
1127         return dest;
1128     }
1129     if (U_FAILURE(fDeferredStatus)) {
1130         status = fDeferredStatus;
1131         bailOut = TRUE;
1132     }
1133     if (fMatch == FALSE) {
1134         status = U_REGEX_INVALID_STATE;
1135         bailOut = TRUE;
1136     }
1137     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138         status = U_INDEX_OUTOFBOUNDS_ERROR;
1139         bailOut = TRUE;
1140     }
1141 
1142     if (bailOut) {
1143         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144     }
1145 
1146     int64_t s, e;
1147     if (groupNum == 0) {
1148         s = fMatchStart;
1149         e = fMatchEnd;
1150     } else {
1151         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152         U_ASSERT(groupOffset < fPattern->fFrameSize);
1153         U_ASSERT(groupOffset >= 0);
1154         s = fFrame->fExtra[groupOffset];
1155         e = fFrame->fExtra[groupOffset+1];
1156     }
1157 
1158     if (s < 0) {
1159         // A capture group wasn't part of the match
1160         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161     }
1162     U_ASSERT(s <= e);
1163     group_len = e - s;
1164 
1165     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166     if (dest)
1167         UTEXT_SETNATIVEINDEX(dest, s);
1168     return dest;
1169 }
1170 
group(int32_t groupNum,UErrorCode & status) const1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172     UnicodeString result;
1173     if (U_FAILURE(status)) {
1174         return result;
1175     }
1176     UText resultText = UTEXT_INITIALIZER;
1177     utext_openUnicodeString(&resultText, &result, &status);
1178     group(groupNum, &resultText, status);
1179     utext_close(&resultText);
1180     return result;
1181 }
1182 
1183 
1184 //  Return deep (mutable) clone
1185 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 //		using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188     UBool bailOut = FALSE;
1189     if (U_FAILURE(status)) {
1190         return dest;
1191     }
1192     if (U_FAILURE(fDeferredStatus)) {
1193         status = fDeferredStatus;
1194         bailOut = TRUE;
1195     }
1196 
1197     if (fMatch == FALSE) {
1198         status = U_REGEX_INVALID_STATE;
1199         bailOut = TRUE;
1200     }
1201     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202         status = U_INDEX_OUTOFBOUNDS_ERROR;
1203         bailOut = TRUE;
1204     }
1205 
1206     if (bailOut) {
1207         if (dest) {
1208             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209             return dest;
1210         } else {
1211             return utext_openUChars(NULL, NULL, 0, &status);
1212         }
1213     }
1214 
1215     int64_t s, e;
1216     if (groupNum == 0) {
1217         s = fMatchStart;
1218         e = fMatchEnd;
1219     } else {
1220         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221         U_ASSERT(groupOffset < fPattern->fFrameSize);
1222         U_ASSERT(groupOffset >= 0);
1223         s = fFrame->fExtra[groupOffset];
1224         e = fFrame->fExtra[groupOffset+1];
1225     }
1226 
1227     if (s < 0) {
1228         // A capture group wasn't part of the match
1229         if (dest) {
1230             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231             return dest;
1232         } else {
1233             return utext_openUChars(NULL, NULL, 0, &status);
1234         }
1235     }
1236     U_ASSERT(s <= e);
1237 
1238     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239         U_ASSERT(e <= fInputLength);
1240         if (dest) {
1241             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242         } else {
1243             UText groupText = UTEXT_INITIALIZER;
1244             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246             utext_close(&groupText);
1247         }
1248     } else {
1249         int32_t len16;
1250         if (UTEXT_USES_U16(fInputText)) {
1251             len16 = (int32_t)(e-s);
1252         } else {
1253             UErrorCode lengthStatus = U_ZERO_ERROR;
1254             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255         }
1256         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257         if (groupChars == NULL) {
1258             status = U_MEMORY_ALLOCATION_ERROR;
1259             return dest;
1260         }
1261         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262 
1263         if (dest) {
1264             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265         } else {
1266             UText groupText = UTEXT_INITIALIZER;
1267             utext_openUChars(&groupText, groupChars, len16, &status);
1268             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269             utext_close(&groupText);
1270         }
1271 
1272         uprv_free(groupChars);
1273     }
1274     return dest;
1275 }
1276 
1277 //--------------------------------------------------------------------------------
1278 //
1279 //  appendGroup() -- currently internal only, appends a group to a UText rather
1280 //                   than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285     if (U_FAILURE(status)) {
1286         return 0;
1287     }
1288     if (U_FAILURE(fDeferredStatus)) {
1289         status = fDeferredStatus;
1290         return 0;
1291     }
1292     int64_t destLen = utext_nativeLength(dest);
1293 
1294     if (fMatch == FALSE) {
1295         status = U_REGEX_INVALID_STATE;
1296         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297     }
1298     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299         status = U_INDEX_OUTOFBOUNDS_ERROR;
1300         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301     }
1302 
1303     int64_t s, e;
1304     if (groupNum == 0) {
1305         s = fMatchStart;
1306         e = fMatchEnd;
1307     } else {
1308         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309         U_ASSERT(groupOffset < fPattern->fFrameSize);
1310         U_ASSERT(groupOffset >= 0);
1311         s = fFrame->fExtra[groupOffset];
1312         e = fFrame->fExtra[groupOffset+1];
1313     }
1314 
1315     if (s < 0) {
1316         // A capture group wasn't part of the match
1317         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318     }
1319     U_ASSERT(s <= e);
1320 
1321     int64_t deltaLen;
1322     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323         U_ASSERT(e <= fInputLength);
1324         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325     } else {
1326         int32_t len16;
1327         if (UTEXT_USES_U16(fInputText)) {
1328             len16 = (int32_t)(e-s);
1329         } else {
1330             UErrorCode lengthStatus = U_ZERO_ERROR;
1331             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332         }
1333         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334         if (groupChars == NULL) {
1335             status = U_MEMORY_ALLOCATION_ERROR;
1336             return 0;
1337         }
1338         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339 
1340         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341         uprv_free(groupChars);
1342     }
1343     return deltaLen;
1344 }
1345 
1346 
1347 
1348 //--------------------------------------------------------------------------------
1349 //
1350 //  groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
groupCount() const1353 int32_t RegexMatcher::groupCount() const {
1354     return fPattern->fGroupMap->size();
1355 }
1356 
1357 
1358 
1359 //--------------------------------------------------------------------------------
1360 //
1361 //  hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365     return fAnchoringBounds;
1366 }
1367 
1368 
1369 //--------------------------------------------------------------------------------
1370 //
1371 //  hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
hasTransparentBounds() const1374 UBool RegexMatcher::hasTransparentBounds() const {
1375     return fTransparentBounds;
1376 }
1377 
1378 
1379 
1380 //--------------------------------------------------------------------------------
1381 //
1382 //  hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
hitEnd() const1385 UBool RegexMatcher::hitEnd() const {
1386     return fHitEnd;
1387 }
1388 
1389 
1390 //--------------------------------------------------------------------------------
1391 //
1392 //  input()
1393 //
1394 //--------------------------------------------------------------------------------
input() const1395 const UnicodeString &RegexMatcher::input() const {
1396     if (!fInput) {
1397         UErrorCode status = U_ZERO_ERROR;
1398         int32_t len16;
1399         if (UTEXT_USES_U16(fInputText)) {
1400             len16 = (int32_t)fInputLength;
1401         } else {
1402             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403             status = U_ZERO_ERROR; // overflow, length status
1404         }
1405         UnicodeString *result = new UnicodeString(len16, 0, 0);
1406 
1407         UChar *inputChars = result->getBuffer(len16);
1408         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409         result->releaseBuffer(len16);
1410 
1411         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412     }
1413 
1414     return *fInput;
1415 }
1416 
1417 //--------------------------------------------------------------------------------
1418 //
1419 //  inputText()
1420 //
1421 //--------------------------------------------------------------------------------
inputText() const1422 UText *RegexMatcher::inputText() const {
1423     return fInputText;
1424 }
1425 
1426 
1427 //--------------------------------------------------------------------------------
1428 //
1429 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433     UBool bailOut = FALSE;
1434     if (U_FAILURE(status)) {
1435         return dest;
1436     }
1437     if (U_FAILURE(fDeferredStatus)) {
1438         status = fDeferredStatus;
1439         bailOut = TRUE;
1440     }
1441 
1442     if (bailOut) {
1443         if (dest) {
1444             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445             return dest;
1446         } else {
1447             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448         }
1449     }
1450 
1451     if (dest) {
1452         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454         } else {
1455             int32_t input16Len;
1456             if (UTEXT_USES_U16(fInputText)) {
1457                 input16Len = (int32_t)fInputLength;
1458             } else {
1459                 UErrorCode lengthStatus = U_ZERO_ERROR;
1460                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461             }
1462             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463             if (inputChars == NULL) {
1464                 return dest;
1465             }
1466 
1467             status = U_ZERO_ERROR;
1468             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469             status = U_ZERO_ERROR;
1470             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471 
1472             uprv_free(inputChars);
1473         }
1474         return dest;
1475     } else {
1476         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477     }
1478 }
1479 
1480 
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483     UBool retVal = FALSE;
1484 
1485     //  In the following test, we're really only interested in whether the UText should switch
1486     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487     //  will still point to the correct data.
1488     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489         UnicodeString *us=(UnicodeString *)ut->context;
1490 
1491         // Update to the latest length.
1492         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493         int32_t newLength = us->length();
1494 
1495         // Update the chunk description.
1496         // The buffer may have switched between stack- and heap-based.
1497         ut->chunkContents    = us->getBuffer();
1498         ut->chunkLength      = newLength;
1499         ut->chunkNativeLimit = newLength;
1500         ut->nativeIndexingLimit = newLength;
1501         retVal = TRUE;
1502     }
1503 
1504     return retVal;
1505 }
1506 
1507 //--------------------------------------------------------------------------------
1508 //
1509 //  lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513     if (U_FAILURE(status)) {
1514         return FALSE;
1515     }
1516     if (U_FAILURE(fDeferredStatus)) {
1517         status = fDeferredStatus;
1518         return FALSE;
1519     }
1520 
1521     if (fInputUniStrMaybeMutable) {
1522         if (compat_SyncMutableUTextContents(fInputText)) {
1523         fInputLength = utext_nativeLength(fInputText);
1524         reset();
1525         }
1526     }
1527     else {
1528         resetPreserveRegion();
1529     }
1530     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532     } else {
1533         MatchAt(fActiveStart, FALSE, status);
1534     }
1535     return fMatch;
1536 }
1537 
1538 
lookingAt(int64_t start,UErrorCode & status)1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540     if (U_FAILURE(status)) {
1541         return FALSE;
1542     }
1543     if (U_FAILURE(fDeferredStatus)) {
1544         status = fDeferredStatus;
1545         return FALSE;
1546     }
1547     reset();
1548 
1549     if (start < 0) {
1550         status = U_INDEX_OUTOFBOUNDS_ERROR;
1551         return FALSE;
1552     }
1553 
1554     if (fInputUniStrMaybeMutable) {
1555         if (compat_SyncMutableUTextContents(fInputText)) {
1556         fInputLength = utext_nativeLength(fInputText);
1557         reset();
1558         }
1559     }
1560 
1561     int64_t nativeStart;
1562     nativeStart = start;
1563     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
1565         return FALSE;
1566     }
1567 
1568     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570     } else {
1571         MatchAt(nativeStart, FALSE, status);
1572     }
1573     return fMatch;
1574 }
1575 
1576 
1577 
1578 //--------------------------------------------------------------------------------
1579 //
1580 //  matches()
1581 //
1582 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584     if (U_FAILURE(status)) {
1585         return FALSE;
1586     }
1587     if (U_FAILURE(fDeferredStatus)) {
1588         status = fDeferredStatus;
1589         return FALSE;
1590     }
1591 
1592     if (fInputUniStrMaybeMutable) {
1593         if (compat_SyncMutableUTextContents(fInputText)) {
1594         fInputLength = utext_nativeLength(fInputText);
1595         reset();
1596         }
1597     }
1598     else {
1599         resetPreserveRegion();
1600     }
1601 
1602     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604     } else {
1605         MatchAt(fActiveStart, TRUE, status);
1606     }
1607     return fMatch;
1608 }
1609 
1610 
matches(int64_t start,UErrorCode & status)1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612     if (U_FAILURE(status)) {
1613         return FALSE;
1614     }
1615     if (U_FAILURE(fDeferredStatus)) {
1616         status = fDeferredStatus;
1617         return FALSE;
1618     }
1619     reset();
1620 
1621     if (start < 0) {
1622         status = U_INDEX_OUTOFBOUNDS_ERROR;
1623         return FALSE;
1624     }
1625 
1626     if (fInputUniStrMaybeMutable) {
1627         if (compat_SyncMutableUTextContents(fInputText)) {
1628         fInputLength = utext_nativeLength(fInputText);
1629         reset();
1630         }
1631     }
1632 
1633     int64_t nativeStart;
1634     nativeStart = start;
1635     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636         status = U_INDEX_OUTOFBOUNDS_ERROR;
1637         return FALSE;
1638     }
1639 
1640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642     } else {
1643         MatchAt(nativeStart, TRUE, status);
1644     }
1645     return fMatch;
1646 }
1647 
1648 
1649 
1650 //--------------------------------------------------------------------------------
1651 //
1652 //    pattern
1653 //
1654 //--------------------------------------------------------------------------------
pattern() const1655 const RegexPattern &RegexMatcher::pattern() const {
1656     return *fPattern;
1657 }
1658 
1659 
1660 
1661 //--------------------------------------------------------------------------------
1662 //
1663 //    region
1664 //
1665 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667     if (U_FAILURE(status)) {
1668         return *this;
1669     }
1670 
1671     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672         status = U_ILLEGAL_ARGUMENT_ERROR;
1673     }
1674 
1675     int64_t nativeStart = regionStart;
1676     int64_t nativeLimit = regionLimit;
1677     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678       status = U_ILLEGAL_ARGUMENT_ERROR;
1679     }
1680 
1681     if (startIndex == -1)
1682       this->reset();
1683     else
1684       resetPreserveRegion();
1685 
1686     fRegionStart = nativeStart;
1687     fRegionLimit = nativeLimit;
1688     fActiveStart = nativeStart;
1689     fActiveLimit = nativeLimit;
1690 
1691     if (startIndex != -1) {
1692       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693           status = U_INDEX_OUTOFBOUNDS_ERROR;
1694       }
1695       fMatchEnd = startIndex;
1696     }
1697 
1698     if (!fTransparentBounds) {
1699         fLookStart = nativeStart;
1700         fLookLimit = nativeLimit;
1701     }
1702     if (fAnchoringBounds) {
1703         fAnchorStart = nativeStart;
1704         fAnchorLimit = nativeLimit;
1705     }
1706     return *this;
1707 }
1708 
region(int64_t start,int64_t limit,UErrorCode & status)1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710   return region(start, limit, -1, status);
1711 }
1712 
1713 //--------------------------------------------------------------------------------
1714 //
1715 //    regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
regionEnd() const1718 int32_t RegexMatcher::regionEnd() const {
1719     return (int32_t)fRegionLimit;
1720 }
1721 
regionEnd64() const1722 int64_t RegexMatcher::regionEnd64() const {
1723     return fRegionLimit;
1724 }
1725 
1726 //--------------------------------------------------------------------------------
1727 //
1728 //    regionStart
1729 //
1730 //--------------------------------------------------------------------------------
regionStart() const1731 int32_t RegexMatcher::regionStart() const {
1732     return (int32_t)fRegionStart;
1733 }
1734 
regionStart64() const1735 int64_t RegexMatcher::regionStart64() const {
1736     return fRegionStart;
1737 }
1738 
1739 
1740 //--------------------------------------------------------------------------------
1741 //
1742 //    replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746     UText replacementText = UTEXT_INITIALIZER;
1747     UText resultText = UTEXT_INITIALIZER;
1748     UnicodeString resultString;
1749     if (U_FAILURE(status)) {
1750         return resultString;
1751     }
1752 
1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754     utext_openUnicodeString(&resultText, &resultString, &status);
1755 
1756     replaceAll(&replacementText, &resultText, status);
1757 
1758     utext_close(&resultText);
1759     utext_close(&replacementText);
1760 
1761     return resultString;
1762 }
1763 
1764 
1765 //
1766 //    replaceAll, UText mode
1767 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769     if (U_FAILURE(status)) {
1770         return dest;
1771     }
1772     if (U_FAILURE(fDeferredStatus)) {
1773         status = fDeferredStatus;
1774         return dest;
1775     }
1776 
1777     if (dest == NULL) {
1778         UnicodeString emptyString;
1779         UText empty = UTEXT_INITIALIZER;
1780 
1781         utext_openUnicodeString(&empty, &emptyString, &status);
1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783         utext_close(&empty);
1784     }
1785 
1786     if (U_SUCCESS(status)) {
1787         reset();
1788         while (find()) {
1789             appendReplacement(dest, replacement, status);
1790             if (U_FAILURE(status)) {
1791                 break;
1792             }
1793         }
1794         appendTail(dest, status);
1795     }
1796 
1797     return dest;
1798 }
1799 
1800 
1801 //--------------------------------------------------------------------------------
1802 //
1803 //    replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807     UText replacementText = UTEXT_INITIALIZER;
1808     UText resultText = UTEXT_INITIALIZER;
1809     UnicodeString resultString;
1810 
1811     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812     utext_openUnicodeString(&resultText, &resultString, &status);
1813 
1814     replaceFirst(&replacementText, &resultText, status);
1815 
1816     utext_close(&resultText);
1817     utext_close(&replacementText);
1818 
1819     return resultString;
1820 }
1821 
1822 //
1823 //    replaceFirst, UText mode
1824 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826     if (U_FAILURE(status)) {
1827         return dest;
1828     }
1829     if (U_FAILURE(fDeferredStatus)) {
1830         status = fDeferredStatus;
1831         return dest;
1832     }
1833 
1834     reset();
1835     if (!find()) {
1836         return getInput(dest, status);
1837     }
1838 
1839     if (dest == NULL) {
1840         UnicodeString emptyString;
1841         UText empty = UTEXT_INITIALIZER;
1842 
1843         utext_openUnicodeString(&empty, &emptyString, &status);
1844         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845         utext_close(&empty);
1846     }
1847 
1848     appendReplacement(dest, replacement, status);
1849     appendTail(dest, status);
1850 
1851     return dest;
1852 }
1853 
1854 
1855 //--------------------------------------------------------------------------------
1856 //
1857 //     requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
requireEnd() const1860 UBool RegexMatcher::requireEnd() const {
1861     return fRequireEnd;
1862 }
1863 
1864 
1865 //--------------------------------------------------------------------------------
1866 //
1867 //     reset
1868 //
1869 //--------------------------------------------------------------------------------
reset()1870 RegexMatcher &RegexMatcher::reset() {
1871     fRegionStart    = 0;
1872     fRegionLimit    = fInputLength;
1873     fActiveStart    = 0;
1874     fActiveLimit    = fInputLength;
1875     fAnchorStart    = 0;
1876     fAnchorLimit    = fInputLength;
1877     fLookStart      = 0;
1878     fLookLimit      = fInputLength;
1879     resetPreserveRegion();
1880     return *this;
1881 }
1882 
1883 
1884 
resetPreserveRegion()1885 void RegexMatcher::resetPreserveRegion() {
1886     fMatchStart     = 0;
1887     fMatchEnd       = 0;
1888     fLastMatchEnd   = -1;
1889     fAppendPosition = 0;
1890     fMatch          = FALSE;
1891     fHitEnd         = FALSE;
1892     fRequireEnd     = FALSE;
1893     fTime           = 0;
1894     fTickCounter    = TIMER_INITIAL_VALUE;
1895     //resetStack(); // more expensive than it looks...
1896 }
1897 
1898 
reset(const UnicodeString & input)1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901     if (fPattern->fNeedsAltInput) {
1902         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903     }
1904     fInputLength = utext_nativeLength(fInputText);
1905 
1906     reset();
1907     delete fInput;
1908     fInput = NULL;
1909 
1910     //  Do the following for any UnicodeString.
1911     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912     fInputUniStrMaybeMutable = TRUE;
1913 
1914     if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916         UErrorCode status = U_ZERO_ERROR;
1917         fWordBreakItr->setText(fInputText, status);
1918 #endif
1919     }
1920     return *this;
1921 }
1922 
1923 
reset(UText * input)1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925     if (fInputText != input) {
1926         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928         fInputLength = utext_nativeLength(fInputText);
1929 
1930         delete fInput;
1931         fInput = NULL;
1932 
1933         if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935             UErrorCode status = U_ZERO_ERROR;
1936             fWordBreakItr->setText(input, status);
1937 #endif
1938         }
1939     }
1940     reset();
1941     fInputUniStrMaybeMutable = FALSE;
1942 
1943     return *this;
1944 }
1945 
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948     return *this;
1949 }*/
1950 
reset(int64_t position,UErrorCode & status)1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952     if (U_FAILURE(status)) {
1953         return *this;
1954     }
1955     reset();       // Reset also resets the region to be the entire string.
1956 
1957     if (position < 0 || position > fActiveLimit) {
1958         status = U_INDEX_OUTOFBOUNDS_ERROR;
1959         return *this;
1960     }
1961     fMatchEnd = position;
1962     return *this;
1963 }
1964 
1965 
1966 // BEGIN android-added
1967 // Removed this function after Android upgrad to ICU4.8.
1968 //--------------------------------------------------------------------------------
1969 //
1970 //    refresh
1971 //
1972 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1973 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1974     if (U_FAILURE(status)) {
1975         return *this;
1976     }
1977     if (input == NULL) {
1978         status = U_ILLEGAL_ARGUMENT_ERROR;
1979         return *this;
1980     }
1981     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1982         status = U_ILLEGAL_ARGUMENT_ERROR;
1983         return *this;
1984     }
1985     int64_t  pos = utext_getNativeIndex(fInputText);
1986     //  Shallow read-only clone of the new UText into the existing input UText
1987     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1988     if (U_FAILURE(status)) {
1989         return *this;
1990     }
1991     utext_setNativeIndex(fInputText, pos);
1992 
1993     if (fAltInputText != NULL) {
1994         pos = utext_getNativeIndex(fAltInputText);
1995         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1996         if (U_FAILURE(status)) {
1997             return *this;
1998         }
1999         utext_setNativeIndex(fAltInputText, pos);
2000     }
2001     return *this;
2002 }
2003 // END android-added
2004 
2005 
2006 //--------------------------------------------------------------------------------
2007 //
2008 //    setTrace
2009 //
2010 //--------------------------------------------------------------------------------
setTrace(UBool state)2011 void RegexMatcher::setTrace(UBool state) {
2012     fTraceDebug = state;
2013 }
2014 
2015 
2016 
2017 //---------------------------------------------------------------------
2018 //
2019 //   split
2020 //
2021 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2022 int32_t  RegexMatcher::split(const UnicodeString &input,
2023         UnicodeString    dest[],
2024         int32_t          destCapacity,
2025         UErrorCode      &status)
2026 {
2027     UText inputText = UTEXT_INITIALIZER;
2028     utext_openConstUnicodeString(&inputText, &input, &status);
2029     if (U_FAILURE(status)) {
2030         return 0;
2031     }
2032 
2033     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2034     if (destText == NULL) {
2035         status = U_MEMORY_ALLOCATION_ERROR;
2036         return 0;
2037     }
2038     int32_t i;
2039     for (i = 0; i < destCapacity; i++) {
2040         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2041     }
2042 
2043     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2044 
2045     for (i = 0; i < destCapacity; i++) {
2046         utext_close(destText[i]);
2047     }
2048 
2049     uprv_free(destText);
2050     utext_close(&inputText);
2051     return fieldCount;
2052 }
2053 
2054 //
2055 //   split, UText mode
2056 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2057 int32_t  RegexMatcher::split(UText *input,
2058         UText           *dest[],
2059         int32_t          destCapacity,
2060         UErrorCode      &status)
2061 {
2062     //
2063     // Check arguements for validity
2064     //
2065     if (U_FAILURE(status)) {
2066         return 0;
2067     };
2068 
2069     if (destCapacity < 1) {
2070         status = U_ILLEGAL_ARGUMENT_ERROR;
2071         return 0;
2072     }
2073 
2074     //
2075     // Reset for the input text
2076     //
2077     reset(input);
2078     int64_t   nextOutputStringStart = 0;
2079     if (fActiveLimit == 0) {
2080         return 0;
2081     }
2082 
2083     //
2084     // Loop through the input text, searching for the delimiter pattern
2085     //
2086     int32_t i;
2087     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2088     for (i=0; ; i++) {
2089         if (i>=destCapacity-1) {
2090             // There is one or zero output string left.
2091             // Fill the last output string with whatever is left from the input, then exit the loop.
2092             //  ( i will be == destCapacity if we filled the output array while processing
2093             //    capture groups of the delimiter expression, in which case we will discard the
2094             //    last capture group saved in favor of the unprocessed remainder of the
2095             //    input string.)
2096             i = destCapacity-1;
2097             if (fActiveLimit > nextOutputStringStart) {
2098                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2099                     if (dest[i]) {
2100                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2101                                       input->chunkContents+nextOutputStringStart,
2102                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2103                     } else {
2104                         UText remainingText = UTEXT_INITIALIZER;
2105                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2106                                          fActiveLimit-nextOutputStringStart, &status);
2107                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2108                         utext_close(&remainingText);
2109                     }
2110                 } else {
2111                     UErrorCode lengthStatus = U_ZERO_ERROR;
2112                     int32_t remaining16Length =
2113                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2114                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2115                     if (remainingChars == NULL) {
2116                         status = U_MEMORY_ALLOCATION_ERROR;
2117                         break;
2118                     }
2119 
2120                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2121                     if (dest[i]) {
2122                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2123                     } else {
2124                         UText remainingText = UTEXT_INITIALIZER;
2125                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2126                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2127                         utext_close(&remainingText);
2128                     }
2129 
2130                     uprv_free(remainingChars);
2131                 }
2132             }
2133             break;
2134         }
2135         if (find()) {
2136             // We found another delimiter.  Move everything from where we started looking
2137             //  up until the start of the delimiter into the next output string.
2138             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2139                 if (dest[i]) {
2140                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2141                                   input->chunkContents+nextOutputStringStart,
2142                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2143                 } else {
2144                     UText remainingText = UTEXT_INITIALIZER;
2145                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2146                                       fMatchStart-nextOutputStringStart, &status);
2147                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2148                     utext_close(&remainingText);
2149                 }
2150             } else {
2151                 UErrorCode lengthStatus = U_ZERO_ERROR;
2152                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2153                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2154                 if (remainingChars == NULL) {
2155                     status = U_MEMORY_ALLOCATION_ERROR;
2156                     break;
2157                 }
2158                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2159                 if (dest[i]) {
2160                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2161                 } else {
2162                     UText remainingText = UTEXT_INITIALIZER;
2163                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2164                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2165                     utext_close(&remainingText);
2166                 }
2167 
2168                 uprv_free(remainingChars);
2169             }
2170             nextOutputStringStart = fMatchEnd;
2171 
2172             // If the delimiter pattern has capturing parentheses, the captured
2173             //  text goes out into the next n destination strings.
2174             int32_t groupNum;
2175             UBool lastGroupWasNullUText = FALSE;
2176             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2177                 if (i==destCapacity-1) {
2178                     break;
2179                 }
2180                 i++;
2181                 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2182                 dest[i] = group(groupNum, dest[i], status);
2183             }
2184 
2185             if (nextOutputStringStart == fActiveLimit) {
2186                 // The delimiter was at the end of the string.  We're done.
2187                 break;
2188             } else if (i == destCapacity-1) {
2189                 // We're out of capture groups, and the rest of the string is more important
2190                 if (lastGroupWasNullUText) {
2191                     utext_close(dest[i]);
2192                     dest[i] = NULL;
2193                 }
2194             }
2195 
2196         }
2197         else
2198         {
2199             // We ran off the end of the input while looking for the next delimiter.
2200             // All the remaining text goes into the current output string.
2201             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2202                 if (dest[i]) {
2203                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2204                                   input->chunkContents+nextOutputStringStart,
2205                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2206                 } else {
2207                     UText remainingText = UTEXT_INITIALIZER;
2208                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2209                                      fActiveLimit-nextOutputStringStart, &status);
2210                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2211                     utext_close(&remainingText);
2212                 }
2213             } else {
2214                 UErrorCode lengthStatus = U_ZERO_ERROR;
2215                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2216                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2217                 if (remainingChars == NULL) {
2218                     status = U_MEMORY_ALLOCATION_ERROR;
2219                     break;
2220                 }
2221 
2222                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2223                 if (dest[i]) {
2224                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2225                 } else {
2226                     UText remainingText = UTEXT_INITIALIZER;
2227                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2228                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2229                     utext_close(&remainingText);
2230                 }
2231 
2232                 uprv_free(remainingChars);
2233             }
2234             break;
2235         }
2236         if (U_FAILURE(status)) {
2237             break;
2238         }
2239     }   // end of for loop
2240     return i+1;
2241 }
2242 
2243 
2244 //--------------------------------------------------------------------------------
2245 //
2246 //     start
2247 //
2248 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2249 int32_t RegexMatcher::start(UErrorCode &status) const {
2250     return start(0, status);
2251 }
2252 
start64(UErrorCode & status) const2253 int64_t RegexMatcher::start64(UErrorCode &status) const {
2254     return start64(0, status);
2255 }
2256 
2257 //--------------------------------------------------------------------------------
2258 //
2259 //     start(int32_t group, UErrorCode &status)
2260 //
2261 //--------------------------------------------------------------------------------
2262 
start64(int32_t group,UErrorCode & status) const2263 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2264     if (U_FAILURE(status)) {
2265         return -1;
2266     }
2267     if (U_FAILURE(fDeferredStatus)) {
2268         status = fDeferredStatus;
2269         return -1;
2270     }
2271     if (fMatch == FALSE) {
2272         status = U_REGEX_INVALID_STATE;
2273         return -1;
2274     }
2275     if (group < 0 || group > fPattern->fGroupMap->size()) {
2276         status = U_INDEX_OUTOFBOUNDS_ERROR;
2277         return -1;
2278     }
2279     int64_t s;
2280     if (group == 0) {
2281         s = fMatchStart;
2282     } else {
2283         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2284         U_ASSERT(groupOffset < fPattern->fFrameSize);
2285         U_ASSERT(groupOffset >= 0);
2286         s = fFrame->fExtra[groupOffset];
2287     }
2288 
2289     return s;
2290 }
2291 
2292 
start(int32_t group,UErrorCode & status) const2293 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2294     return (int32_t)start64(group, status);
2295 }
2296 
2297 //--------------------------------------------------------------------------------
2298 //
2299 //     useAnchoringBounds
2300 //
2301 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2302 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2303     fAnchoringBounds = b;
2304     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2305     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2306     return *this;
2307 }
2308 
2309 
2310 //--------------------------------------------------------------------------------
2311 //
2312 //     useTransparentBounds
2313 //
2314 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2315 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2316     fTransparentBounds = b;
2317     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2318     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2319     return *this;
2320 }
2321 
2322 //--------------------------------------------------------------------------------
2323 //
2324 //     setTimeLimit
2325 //
2326 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2327 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2328     if (U_FAILURE(status)) {
2329         return;
2330     }
2331     if (U_FAILURE(fDeferredStatus)) {
2332         status = fDeferredStatus;
2333         return;
2334     }
2335     if (limit < 0) {
2336         status = U_ILLEGAL_ARGUMENT_ERROR;
2337         return;
2338     }
2339     fTimeLimit = limit;
2340 }
2341 
2342 
2343 //--------------------------------------------------------------------------------
2344 //
2345 //     getTimeLimit
2346 //
2347 //--------------------------------------------------------------------------------
getTimeLimit() const2348 int32_t RegexMatcher::getTimeLimit() const {
2349     return fTimeLimit;
2350 }
2351 
2352 
2353 //--------------------------------------------------------------------------------
2354 //
2355 //     setStackLimit
2356 //
2357 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2358 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2359     if (U_FAILURE(status)) {
2360         return;
2361     }
2362     if (U_FAILURE(fDeferredStatus)) {
2363         status = fDeferredStatus;
2364         return;
2365     }
2366     if (limit < 0) {
2367         status = U_ILLEGAL_ARGUMENT_ERROR;
2368         return;
2369     }
2370 
2371     // Reset the matcher.  This is needed here in case there is a current match
2372     //    whose final stack frame (containing the match results, pointed to by fFrame)
2373     //    would be lost by resizing to a smaller stack size.
2374     reset();
2375 
2376     if (limit == 0) {
2377         // Unlimited stack expansion
2378         fStack->setMaxCapacity(0);
2379     } else {
2380         // Change the units of the limit  from bytes to ints, and bump the size up
2381         //   to be big enough to hold at least one stack frame for the pattern,
2382         //   if it isn't there already.
2383         int32_t adjustedLimit = limit / sizeof(int32_t);
2384         if (adjustedLimit < fPattern->fFrameSize) {
2385             adjustedLimit = fPattern->fFrameSize;
2386         }
2387         fStack->setMaxCapacity(adjustedLimit);
2388     }
2389     fStackLimit = limit;
2390 }
2391 
2392 
2393 //--------------------------------------------------------------------------------
2394 //
2395 //     getStackLimit
2396 //
2397 //--------------------------------------------------------------------------------
getStackLimit() const2398 int32_t RegexMatcher::getStackLimit() const {
2399     return fStackLimit;
2400 }
2401 
2402 
2403 //--------------------------------------------------------------------------------
2404 //
2405 //     setMatchCallback
2406 //
2407 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2408 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2409                                     const void              *context,
2410                                     UErrorCode              &status) {
2411     if (U_FAILURE(status)) {
2412         return;
2413     }
2414     fCallbackFn = callback;
2415     fCallbackContext = context;
2416 }
2417 
2418 
2419 //--------------------------------------------------------------------------------
2420 //
2421 //     getMatchCallback
2422 //
2423 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2424 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2425                                   const void              *&context,
2426                                   UErrorCode              &status) {
2427     if (U_FAILURE(status)) {
2428        return;
2429     }
2430     callback = fCallbackFn;
2431     context  = fCallbackContext;
2432 }
2433 
2434 
2435 //--------------------------------------------------------------------------------
2436 //
2437 //     setMatchCallback
2438 //
2439 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2440 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2441                                                 const void                      *context,
2442                                                 UErrorCode                      &status) {
2443     if (U_FAILURE(status)) {
2444         return;
2445     }
2446     fFindProgressCallbackFn = callback;
2447     fFindProgressCallbackContext = context;
2448 }
2449 
2450 
2451 //--------------------------------------------------------------------------------
2452 //
2453 //     getMatchCallback
2454 //
2455 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2456 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2457                                                 const void                    *&context,
2458                                                 UErrorCode                    &status) {
2459     if (U_FAILURE(status)) {
2460        return;
2461     }
2462     callback = fFindProgressCallbackFn;
2463     context  = fFindProgressCallbackContext;
2464 }
2465 
2466 
2467 //================================================================================
2468 //
2469 //    Code following this point in this file is the internal
2470 //    Match Engine Implementation.
2471 //
2472 //================================================================================
2473 
2474 
2475 //--------------------------------------------------------------------------------
2476 //
2477 //   resetStack
2478 //           Discard any previous contents of the state save stack, and initialize a
2479 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2480 //           where they indicate that a group has not yet matched anything.
2481 //--------------------------------------------------------------------------------
resetStack()2482 REStackFrame *RegexMatcher::resetStack() {
2483     // Discard any previous contents of the state save stack, and initialize a
2484     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2485     //  where they indicate that a group has not yet matched anything.
2486     fStack->removeAllElements();
2487 
2488     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2489     int32_t i;
2490     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2491         iFrame->fExtra[i] = -1;
2492     }
2493     return iFrame;
2494 }
2495 
2496 
2497 
2498 //--------------------------------------------------------------------------------
2499 //
2500 //   isWordBoundary
2501 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2502 //                     For us,
2503 //                       If the current char is a combining mark,
2504 //                          \b is FALSE.
2505 //                       Else Scan backwards to the first non-combining char.
2506 //                            We are at a boundary if the this char and the original chars are
2507 //                               opposite in membership in \w set
2508 //
2509 //          parameters:   pos   - the current position in the input buffer
2510 //
2511 //              TODO:  double-check edge cases at region boundaries.
2512 //
2513 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2514 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2515     UBool isBoundary = FALSE;
2516     UBool cIsWord    = FALSE;
2517 
2518     if (pos >= fLookLimit) {
2519         fHitEnd = TRUE;
2520     } else {
2521         // Determine whether char c at current position is a member of the word set of chars.
2522         // If we're off the end of the string, behave as though we're not at a word char.
2523         UTEXT_SETNATIVEINDEX(fInputText, pos);
2524         UChar32  c = UTEXT_CURRENT32(fInputText);
2525         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2526             // Current char is a combining one.  Not a boundary.
2527             return FALSE;
2528         }
2529         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2530     }
2531 
2532     // Back up until we come to a non-combining char, determine whether
2533     //  that char is a word char.
2534     UBool prevCIsWord = FALSE;
2535     for (;;) {
2536         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2537             break;
2538         }
2539         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2540         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2541               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2542             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2543             break;
2544         }
2545     }
2546     isBoundary = cIsWord ^ prevCIsWord;
2547     return isBoundary;
2548 }
2549 
isChunkWordBoundary(int32_t pos)2550 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2551     UBool isBoundary = FALSE;
2552     UBool cIsWord    = FALSE;
2553 
2554     const UChar *inputBuf = fInputText->chunkContents;
2555 
2556     if (pos >= fLookLimit) {
2557         fHitEnd = TRUE;
2558     } else {
2559         // Determine whether char c at current position is a member of the word set of chars.
2560         // If we're off the end of the string, behave as though we're not at a word char.
2561         UChar32 c;
2562         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2563         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2564             // Current char is a combining one.  Not a boundary.
2565             return FALSE;
2566         }
2567         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2568     }
2569 
2570     // Back up until we come to a non-combining char, determine whether
2571     //  that char is a word char.
2572     UBool prevCIsWord = FALSE;
2573     for (;;) {
2574         if (pos <= fLookStart) {
2575             break;
2576         }
2577         UChar32 prevChar;
2578         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2579         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2580               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2581             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2582             break;
2583         }
2584     }
2585     isBoundary = cIsWord ^ prevCIsWord;
2586     return isBoundary;
2587 }
2588 
2589 //--------------------------------------------------------------------------------
2590 //
2591 //   isUWordBoundary
2592 //
2593 //         Test for a word boundary using RBBI word break.
2594 //
2595 //          parameters:   pos   - the current position in the input buffer
2596 //
2597 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2598 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2599     UBool       returnVal = FALSE;
2600 #if UCONFIG_NO_BREAK_ITERATION==0
2601 
2602     // If we haven't yet created a break iterator for this matcher, do it now.
2603     if (fWordBreakItr == NULL) {
2604         fWordBreakItr =
2605             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2606         if (U_FAILURE(fDeferredStatus)) {
2607             return FALSE;
2608         }
2609         fWordBreakItr->setText(fInputText, fDeferredStatus);
2610     }
2611 
2612     if (pos >= fLookLimit) {
2613         fHitEnd = TRUE;
2614         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2615                             //    words are not boundaries.  All non-word chars stand by themselves,
2616                             //    with word boundaries on both sides.
2617     } else {
2618         if (!UTEXT_USES_U16(fInputText)) {
2619             // !!!: Would like a better way to do this!
2620             UErrorCode status = U_ZERO_ERROR;
2621             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2622         }
2623         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2624     }
2625 #endif
2626     return   returnVal;
2627 }
2628 
2629 //--------------------------------------------------------------------------------
2630 //
2631 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2632 //                     saves. Increment the "time" counter, and call the
2633 //                     user callback function if there is one installed.
2634 //
2635 //                     If the match operation needs to be aborted, either for a time-out
2636 //                     or because the user callback asked for it, just set an error status.
2637 //                     The engine will pick that up and stop in its outer loop.
2638 //
2639 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2640 void RegexMatcher::IncrementTime(UErrorCode &status) {
2641     fTickCounter = TIMER_INITIAL_VALUE;
2642     fTime++;
2643     if (fCallbackFn != NULL) {
2644         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2645             status = U_REGEX_STOPPED_BY_CALLER;
2646             return;
2647         }
2648     }
2649     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2650         status = U_REGEX_TIME_OUT;
2651     }
2652 }
2653 
2654 //--------------------------------------------------------------------------------
2655 //
2656 //   ReportFindProgress     This function is called once for each advance in the target
2657 //                          string from the find() function, and calls the user progress callback
2658 //                          function if there is one installed.
2659 //
2660 //                          NOTE:
2661 //
2662 //                          If the match operation needs to be aborted because the user
2663 //                          callback asked for it, just set an error status.
2664 //                          The engine will pick that up and stop in its outer loop.
2665 //
2666 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2667 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2668     if (fFindProgressCallbackFn != NULL) {
2669         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2670             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2671             return FALSE;
2672         }
2673     }
2674     return TRUE;
2675 }
2676 
2677 //--------------------------------------------------------------------------------
2678 //
2679 //   StateSave
2680 //       Make a new stack frame, initialized as a copy of the current stack frame.
2681 //       Set the pattern index in the original stack frame from the operand value
2682 //       in the opcode.  Execution of the engine continues with the state in
2683 //       the newly created stack frame
2684 //
2685 //       Note that reserveBlock() may grow the stack, resulting in the
2686 //       whole thing being relocated in memory.
2687 //
2688 //    Parameters:
2689 //       fp           The top frame pointer when called.  At return, a new
2690 //                    fame will be present
2691 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2692 //                    (not new) frame.  If execution ever back-tracks out of the
2693 //                    new frame, this will be where we continue from in the pattern.
2694 //    Return
2695 //                    The new frame pointer.
2696 //
2697 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2698 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2699     // push storage for a new frame.
2700     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2701     if (newFP == NULL) {
2702         // Failure on attempted stack expansion.
2703         //   Stack function set some other error code, change it to a more
2704         //   specific one for regular expressions.
2705         status = U_REGEX_STACK_OVERFLOW;
2706         // We need to return a writable stack frame, so just return the
2707         //    previous frame.  The match operation will stop quickly
2708         //    because of the error status, after which the frame will never
2709         //    be looked at again.
2710         return fp;
2711     }
2712     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2713 
2714     // New stack frame = copy of old top frame.
2715     int64_t *source = (int64_t *)fp;
2716     int64_t *dest   = newFP;
2717     for (;;) {
2718         *dest++ = *source++;
2719         if (source == newFP) {
2720             break;
2721         }
2722     }
2723 
2724     fTickCounter--;
2725     if (fTickCounter <= 0) {
2726        IncrementTime(status);    // Re-initializes fTickCounter
2727     }
2728     fp->fPatIdx = savePatIdx;
2729     return (REStackFrame *)newFP;
2730 }
2731 
2732 
2733 //--------------------------------------------------------------------------------
2734 //
2735 //   MatchAt      This is the actual matching engine.
2736 //
2737 //                  startIdx:    begin matching a this index.
2738 //                  toEnd:       if true, match must extend to end of the input region
2739 //
2740 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2741 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2742     UBool       isMatch  = FALSE;      // True if the we have a match.
2743 
2744     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2745 
2746     int32_t     op;                    // Operation from the compiled pattern, split into
2747     int32_t     opType;                //    the opcode
2748     int32_t     opValue;               //    and the operand value.
2749 
2750     #ifdef REGEX_RUN_DEBUG
2751     if (fTraceDebug)
2752     {
2753         printf("MatchAt(startIdx=%ld)\n", startIdx);
2754         printf("Original Pattern: ");
2755         UChar32 c = utext_next32From(fPattern->fPattern, 0);
2756         while (c != U_SENTINEL) {
2757             if (c<32 || c>256) {
2758                 c = '.';
2759             }
2760             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2761 
2762             c = UTEXT_NEXT32(fPattern->fPattern);
2763         }
2764         printf("\n");
2765         printf("Input String: ");
2766         c = utext_next32From(fInputText, 0);
2767         while (c != U_SENTINEL) {
2768             if (c<32 || c>256) {
2769                 c = '.';
2770             }
2771             printf("%c", c);
2772 
2773             c = UTEXT_NEXT32(fInputText);
2774         }
2775         printf("\n");
2776         printf("\n");
2777     }
2778     #endif
2779 
2780     if (U_FAILURE(status)) {
2781         return;
2782     }
2783 
2784     //  Cache frequently referenced items from the compiled pattern
2785     //
2786     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2787 
2788     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2789     UVector             *sets          = fPattern->fSets;
2790 
2791     fFrameSize = fPattern->fFrameSize;
2792     REStackFrame        *fp            = resetStack();
2793 
2794     fp->fPatIdx   = 0;
2795     fp->fInputIdx = startIdx;
2796 
2797     // Zero out the pattern's static data
2798     int32_t i;
2799     for (i = 0; i<fPattern->fDataSize; i++) {
2800         fData[i] = 0;
2801     }
2802 
2803     //
2804     //  Main loop for interpreting the compiled pattern.
2805     //  One iteration of the loop per pattern operation performed.
2806     //
2807     for (;;) {
2808 #if 0
2809         if (_heapchk() != _HEAPOK) {
2810             fprintf(stderr, "Heap Trouble\n");
2811         }
2812 #endif
2813 
2814         op      = (int32_t)pat[fp->fPatIdx];
2815         opType  = URX_TYPE(op);
2816         opValue = URX_VAL(op);
2817         #ifdef REGEX_RUN_DEBUG
2818         if (fTraceDebug) {
2819             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2820             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2821                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2822             fPattern->dumpOp(fp->fPatIdx);
2823         }
2824         #endif
2825         fp->fPatIdx++;
2826 
2827         switch (opType) {
2828 
2829 
2830         case URX_NOP:
2831             break;
2832 
2833 
2834         case URX_BACKTRACK:
2835             // Force a backtrack.  In some circumstances, the pattern compiler
2836             //   will notice that the pattern can't possibly match anything, and will
2837             //   emit one of these at that point.
2838             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2839             break;
2840 
2841 
2842         case URX_ONECHAR:
2843             if (fp->fInputIdx < fActiveLimit) {
2844                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2845                 UChar32 c = UTEXT_NEXT32(fInputText);
2846                 if (c == opValue) {
2847                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2848                     break;
2849                 }
2850             } else {
2851                 fHitEnd = TRUE;
2852             }
2853 
2854             #ifdef REGEX_SMART_BACKTRACKING
2855             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2856                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2857                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2858                     UBool success = FALSE;
2859                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
2860                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2861                         if (c == opValue) {
2862                             success = TRUE;
2863                             break;
2864                         } else if (c == U_SENTINEL) {
2865                             break;
2866                         }
2867                         c = UTEXT_PREVIOUS32(fInputText);
2868                     }
2869                     if (success) {
2870                         fHitEnd = FALSE;
2871                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2872                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2873                         if (fp->fInputIdx > backSearchIndex) {
2874                             fp = StateSave(fp, fp->fPatIdx, status);
2875                         }
2876                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
2877                         break;
2878                     }
2879                 }
2880             }
2881             #endif
2882 
2883             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2884             break;
2885 
2886 
2887         case URX_STRING:
2888             {
2889                 // Test input against a literal string.
2890                 // Strings require two slots in the compiled pattern, one for the
2891                 //   offset to the string text, and one for the length.
2892                 int32_t   stringStartIdx = opValue;
2893                 int32_t   stringLen;
2894 
2895                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2896                 fp->fPatIdx++;
2897                 opType    = URX_TYPE(op);
2898                 stringLen = URX_VAL(op);
2899                 U_ASSERT(opType == URX_STRING_LEN);
2900                 U_ASSERT(stringLen >= 2);
2901 
2902                 const UChar *patternChars = litText+stringStartIdx;
2903                 const UChar *patternEnd = patternChars+stringLen;
2904 
2905                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2906                 UChar32 c;
2907                 UBool success = TRUE;
2908 
2909                 while (patternChars < patternEnd && success) {
2910                     c = UTEXT_NEXT32(fInputText);
2911 
2912                     if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2913                         if (U_IS_BMP(c)) {
2914                             success = (*patternChars == c);
2915                             patternChars += 1;
2916                         } else if (patternChars+1 < patternEnd) {
2917                             success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2918                             patternChars += 2;
2919                         }
2920                     } else {
2921                         success = FALSE;
2922                         fHitEnd = TRUE;          //   TODO:  See ticket 6074
2923                     }
2924                 }
2925 
2926                 if (success) {
2927                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2928                 } else {
2929                     #ifdef REGEX_SMART_BACKTRACKING
2930                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2931                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2932                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2933                             // Reset to last start point
2934                             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2935                             patternChars = litText+stringStartIdx;
2936 
2937                             // Search backwards for a possible start
2938                             do {
2939                                 c = UTEXT_PREVIOUS32(fInputText);
2940                                 if (c == U_SENTINEL) {
2941                                     break;
2942                                 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2943                                     (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2944                                     success = TRUE;
2945                                     break;
2946                                 }
2947                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2948 
2949                             // And try again
2950                             if (success) {
2951                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2952                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2953                                 if (fp->fInputIdx > backSearchIndex) {
2954                                     fp = StateSave(fp, fp->fPatIdx, status);
2955                                 }
2956                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2957                                 break;
2958                             }
2959                         }
2960                     }
2961                     #endif
2962                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2963                 }
2964             }
2965             break;
2966 
2967 
2968         case URX_STATE_SAVE:
2969             fp = StateSave(fp, opValue, status);
2970             break;
2971 
2972 
2973         case URX_END:
2974             // The match loop will exit via this path on a successful match,
2975             //   when we reach the end of the pattern.
2976             if (toEnd && fp->fInputIdx != fActiveLimit) {
2977                 // The pattern matched, but not to the end of input.  Try some more.
2978                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979                 break;
2980             }
2981             isMatch = TRUE;
2982             goto  breakFromLoop;
2983 
2984         // Start and End Capture stack frame variables are laid out out like this:
2985             //  fp->fExtra[opValue]  - The start of a completed capture group
2986             //             opValue+1 - The end   of a completed capture group
2987             //             opValue+2 - the start of a capture group whose end
2988             //                          has not yet been reached (and might not ever be).
2989         case URX_START_CAPTURE:
2990             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2991             fp->fExtra[opValue+2] = fp->fInputIdx;
2992             break;
2993 
2994 
2995         case URX_END_CAPTURE:
2996             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2997             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2998             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2999             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
3000             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3001             break;
3002 
3003 
3004         case URX_DOLLAR:                   //  $, test for End of line
3005                                            //     or for position before new line at end of input
3006             {
3007                 if (fp->fInputIdx >= fAnchorLimit) {
3008                     // We really are at the end of input.  Success.
3009                     fHitEnd = TRUE;
3010                     fRequireEnd = TRUE;
3011                     break;
3012                 }
3013 
3014                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015 
3016                 // If we are positioned just before a new-line that is located at the
3017                 //   end of input, succeed.
3018                 UChar32 c = UTEXT_NEXT32(fInputText);
3019                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3020                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3021                         // If not in the middle of a CR/LF sequence
3022                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3023                             // At new-line at end of input. Success
3024                             fHitEnd = TRUE;
3025                             fRequireEnd = TRUE;
3026 
3027                             break;
3028                         }
3029                     }
3030                 } else {
3031                     UChar32 nextC = UTEXT_NEXT32(fInputText);
3032                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3033                         fHitEnd = TRUE;
3034                         fRequireEnd = TRUE;
3035                         break;                         // At CR/LF at end of input.  Success
3036                     }
3037                 }
3038 
3039                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3040             }
3041             break;
3042 
3043 
3044          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3045             if (fp->fInputIdx >= fAnchorLimit) {
3046                 // Off the end of input.  Success.
3047                 fHitEnd = TRUE;
3048                 fRequireEnd = TRUE;
3049                 break;
3050             } else {
3051                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3052                 UChar32 c = UTEXT_NEXT32(fInputText);
3053                 // Either at the last character of input, or off the end.
3054                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3055                     fHitEnd = TRUE;
3056                     fRequireEnd = TRUE;
3057                     break;
3058                 }
3059             }
3060 
3061             // Not at end of input.  Back-track out.
3062             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3063             break;
3064 
3065 
3066          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3067              {
3068                  if (fp->fInputIdx >= fAnchorLimit) {
3069                      // We really are at the end of input.  Success.
3070                      fHitEnd = TRUE;
3071                      fRequireEnd = TRUE;
3072                      break;
3073                  }
3074                  // If we are positioned just before a new-line, succeed.
3075                  // It makes no difference where the new-line is within the input.
3076                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3077                  UChar32 c = UTEXT_CURRENT32(fInputText);
3078                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3079                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3080                      //  In multi-line mode, hitting a new-line just before the end of input does not
3081                      //   set the hitEnd or requireEnd flags
3082                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3083                         break;
3084                      }
3085                  }
3086                  // not at a new line.  Fail.
3087                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3088              }
3089              break;
3090 
3091 
3092          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3093              {
3094                  if (fp->fInputIdx >= fAnchorLimit) {
3095                      // We really are at the end of input.  Success.
3096                      fHitEnd = TRUE;
3097                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3098                      break;               //   adding a new-line would not lose the match.
3099                  }
3100                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3101                  // It makes no difference where the new-line is within the input.
3102                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3104                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105                  }
3106              }
3107              break;
3108 
3109 
3110        case URX_CARET:                    //  ^, test for start of line
3111             if (fp->fInputIdx != fAnchorStart) {
3112                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3113             }
3114             break;
3115 
3116 
3117        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3118            {
3119                if (fp->fInputIdx == fAnchorStart) {
3120                    // We are at the start input.  Success.
3121                    break;
3122                }
3123                // Check whether character just before the current pos is a new-line
3124                //   unless we are at the end of input
3125                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3126                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3127                if ((fp->fInputIdx < fAnchorLimit) &&
3128                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3129                    //  It's a new-line.  ^ is true.  Success.
3130                    //  TODO:  what should be done with positions between a CR and LF?
3131                    break;
3132                }
3133                // Not at the start of a line.  Fail.
3134                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3135            }
3136            break;
3137 
3138 
3139        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3140            {
3141                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3142                if (fp->fInputIdx <= fAnchorStart) {
3143                    // We are at the start input.  Success.
3144                    break;
3145                }
3146                // Check whether character just before the current pos is a new-line
3147                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3148                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3149                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3150                if (c != 0x0a) {
3151                    // Not at the start of a line.  Back-track out.
3152                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3153                }
3154            }
3155            break;
3156 
3157         case URX_BACKSLASH_B:          // Test for word boundaries
3158             {
3159                 UBool success = isWordBoundary(fp->fInputIdx);
3160                 success ^= (opValue != 0);     // flip sense for \B
3161                 if (!success) {
3162                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163                 }
3164             }
3165             break;
3166 
3167 
3168         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3169             {
3170                 UBool success = isUWordBoundary(fp->fInputIdx);
3171                 success ^= (opValue != 0);     // flip sense for \B
3172                 if (!success) {
3173                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3174                 }
3175             }
3176             break;
3177 
3178 
3179         case URX_BACKSLASH_D:            // Test for decimal digit
3180             {
3181                 if (fp->fInputIdx >= fActiveLimit) {
3182                     fHitEnd = TRUE;
3183                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3184                     break;
3185                 }
3186 
3187                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3188 
3189                 UChar32 c = UTEXT_NEXT32(fInputText);
3190                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3191                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3192                 success ^= (opValue != 0);        // flip sense for \D
3193                 if (success) {
3194                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3195                 } else {
3196                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3197                 }
3198             }
3199             break;
3200 
3201 
3202         case URX_BACKSLASH_G:          // Test for position at end of previous match
3203             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3204                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3205             }
3206             break;
3207 
3208 
3209         case URX_BACKSLASH_X:
3210             //  Match a Grapheme, as defined by Unicode TR 29.
3211             //  Differs slightly from Perl, which consumes combining marks independently
3212             //    of context.
3213             {
3214 
3215                 // Fail if at end of input
3216                 if (fp->fInputIdx >= fActiveLimit) {
3217                     fHitEnd = TRUE;
3218                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3219                     break;
3220                 }
3221 
3222                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3223 
3224                 // Examine (and consume) the current char.
3225                 //   Dispatch into a little state machine, based on the char.
3226                 UChar32  c;
3227                 c = UTEXT_NEXT32(fInputText);
3228                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3229                 UnicodeSet **sets = fPattern->fStaticSets;
3230                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3231                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3232                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3233                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3234                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3235                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3236                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3237                 goto GC_Extend;
3238 
3239 
3240 
3241 GC_L:
3242                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3243                 c = UTEXT_NEXT32(fInputText);
3244                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3246                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3247                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3248                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3249                 UTEXT_PREVIOUS32(fInputText);
3250                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3251                 goto GC_Extend;
3252 
3253 GC_V:
3254                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3255                 c = UTEXT_NEXT32(fInputText);
3256                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3257                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3258                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3259                 UTEXT_PREVIOUS32(fInputText);
3260                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261                 goto GC_Extend;
3262 
3263 GC_T:
3264                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3265                 c = UTEXT_NEXT32(fInputText);
3266                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3267                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3268                 UTEXT_PREVIOUS32(fInputText);
3269                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270                 goto GC_Extend;
3271 
3272 GC_Extend:
3273                 // Combining characters are consumed here
3274                 for (;;) {
3275                     if (fp->fInputIdx >= fActiveLimit) {
3276                         break;
3277                     }
3278                     c = UTEXT_CURRENT32(fInputText);
3279                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3280                         break;
3281                     }
3282                     UTEXT_NEXT32(fInputText);
3283                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3284                 }
3285                 goto GC_Done;
3286 
3287 GC_Control:
3288                 // Most control chars stand alone (don't combine with combining chars),
3289                 //   except for that CR/LF sequence is a single grapheme cluster.
3290                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3291                     c = UTEXT_NEXT32(fInputText);
3292                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3293                 }
3294 
3295 GC_Done:
3296                 if (fp->fInputIdx >= fActiveLimit) {
3297                     fHitEnd = TRUE;
3298                 }
3299                 break;
3300             }
3301 
3302 
3303 
3304 
3305         case URX_BACKSLASH_Z:          // Test for end of Input
3306             if (fp->fInputIdx < fAnchorLimit) {
3307                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3308             } else {
3309                 fHitEnd = TRUE;
3310                 fRequireEnd = TRUE;
3311             }
3312             break;
3313 
3314 
3315 
3316         case URX_STATIC_SETREF:
3317             {
3318                 // Test input character against one of the predefined sets
3319                 //    (Word Characters, for example)
3320                 // The high bit of the op value is a flag for the match polarity.
3321                 //    0:   success if input char is in set.
3322                 //    1:   success if input char is not in set.
3323                 if (fp->fInputIdx >= fActiveLimit) {
3324                     fHitEnd = TRUE;
3325                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3326                     break;
3327                 }
3328 
3329                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3330                 opValue &= ~URX_NEG_SET;
3331                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3332 
3333                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3334                 UChar32 c = UTEXT_NEXT32(fInputText);
3335                 if (c < 256) {
3336                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3337                     if (s8->contains(c)) {
3338                         success = !success;
3339                     }
3340                 } else {
3341                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3342                     if (s->contains(c)) {
3343                         success = !success;
3344                     }
3345                 }
3346                 if (success) {
3347                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3348                 } else {
3349                     // the character wasn't in the set.
3350                     #ifdef REGEX_SMART_BACKTRACKING
3351                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3352                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3353                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3354                             // Try to find it, backwards
3355                             UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3356                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3357                             do {
3358                                 c = UTEXT_PREVIOUS32(fInputText);
3359                                 if (c == U_SENTINEL) {
3360                                     break;
3361                                 } else if (c < 256) {
3362                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3363                                     if (s8->contains(c)) {
3364                                         success = !success;
3365                                     }
3366                                 } else {
3367                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3368                                     if (s->contains(c)) {
3369                                         success = !success;
3370                                     }
3371                                 }
3372                             } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3373 
3374                             if (success && c != U_SENTINEL) {
3375                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3376                                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3377                                 if (fp->fInputIdx > backSearchIndex) {
3378                                     fp = StateSave(fp, fp->fPatIdx, status);
3379                                 }
3380                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3381                                 break;
3382                             }
3383                         }
3384                     }
3385                     #endif
3386                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3387                 }
3388             }
3389             break;
3390 
3391 
3392         case URX_STAT_SETREF_N:
3393             {
3394                 // Test input character for NOT being a member of  one of
3395                 //    the predefined sets (Word Characters, for example)
3396                 if (fp->fInputIdx >= fActiveLimit) {
3397                     fHitEnd = TRUE;
3398                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3399                     break;
3400                 }
3401 
3402                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3403 
3404                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3405 
3406                 UChar32 c = UTEXT_NEXT32(fInputText);
3407                 if (c < 256) {
3408                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3409                     if (s8->contains(c) == FALSE) {
3410                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3411                         break;
3412                     }
3413                 } else {
3414                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3415                     if (s->contains(c) == FALSE) {
3416                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3417                         break;
3418                     }
3419                 }
3420                 // the character wasn't in the set.
3421                 #ifdef REGEX_SMART_BACKTRACKING
3422                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3423                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3424                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3425                         // Try to find it, backwards
3426                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3427                         UBool success = FALSE;
3428                         do {
3429                             c = UTEXT_PREVIOUS32(fInputText);
3430                             if (c == U_SENTINEL) {
3431                                 break;
3432                             } else if (c < 256) {
3433                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3434                                 if (s8->contains(c) == FALSE) {
3435                                     success = TRUE;
3436                                     break;
3437                                 }
3438                             } else {
3439                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3440                                 if (s->contains(c) == FALSE) {
3441                                     success = TRUE;
3442                                     break;
3443                                 }
3444                             }
3445                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3446 
3447                         if (success) {
3448                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3449                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3450                             if (fp->fInputIdx > backSearchIndex) {
3451                                 fp = StateSave(fp, fp->fPatIdx, status);
3452                             }
3453                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3454                             break;
3455                         }
3456                     }
3457                 }
3458                 #endif
3459                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460             }
3461             break;
3462 
3463 
3464         case URX_SETREF:
3465             if (fp->fInputIdx >= fActiveLimit) {
3466                 fHitEnd = TRUE;
3467                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3468                 break;
3469             } else {
3470                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3471 
3472                 // There is input left.  Pick up one char and test it for set membership.
3473                 UChar32 c = UTEXT_NEXT32(fInputText);
3474                 U_ASSERT(opValue > 0 && opValue < sets->size());
3475                 if (c<256) {
3476                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3477                     if (s8->contains(c)) {
3478                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3479                         break;
3480                     }
3481                 } else {
3482                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3483                     if (s->contains(c)) {
3484                         // The character is in the set.  A Match.
3485                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3486                         break;
3487                     }
3488                 }
3489 
3490                 // the character wasn't in the set.
3491                 #ifdef REGEX_SMART_BACKTRACKING
3492                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3493                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3494                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3495                         // Try to find it, backwards
3496                         UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3497                         UBool success = FALSE;
3498                         do {
3499                             c = UTEXT_PREVIOUS32(fInputText);
3500                             if (c == U_SENTINEL) {
3501                                 break;
3502                             } else if (c < 256) {
3503                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3504                                 if (s8->contains(c)) {
3505                                     success = TRUE;
3506                                     break;
3507                                 }
3508                             } else {
3509                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3510                                 if (s->contains(c)) {
3511                                     success = TRUE;
3512                                     break;
3513                                 }
3514                             }
3515                         } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3516 
3517                         if (success) {
3518                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3519                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3520                             if (fp->fInputIdx > backSearchIndex) {
3521                                 fp = StateSave(fp, fp->fPatIdx, status);
3522                             }
3523                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
3524                             break;
3525                         }
3526                     }
3527                 }
3528                 #endif
3529                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3530             }
3531             break;
3532 
3533 
3534         case URX_DOTANY:
3535             {
3536                 // . matches anything, but stops at end-of-line.
3537                 if (fp->fInputIdx >= fActiveLimit) {
3538                     // At end of input.  Match failed.  Backtrack out.
3539                     fHitEnd = TRUE;
3540                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3541                     break;
3542                 }
3543 
3544                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3545 
3546                 // There is input left.  Advance over one char, unless we've hit end-of-line
3547                 UChar32 c = UTEXT_NEXT32(fInputText);
3548                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3549                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3550                     // End of line in normal mode.   . does not match.
3551                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3552                     break;
3553                 }
3554                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3555             }
3556             break;
3557 
3558 
3559         case URX_DOTANY_ALL:
3560             {
3561                 // ., in dot-matches-all (including new lines) mode
3562                 if (fp->fInputIdx >= fActiveLimit) {
3563                     // At end of input.  Match failed.  Backtrack out.
3564                     fHitEnd = TRUE;
3565                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3566                     break;
3567                 }
3568 
3569                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3570 
3571                 // There is input left.  Advance over one char, except if we are
3572                 //   at a cr/lf, advance over both of them.
3573                 UChar32 c;
3574                 c = UTEXT_NEXT32(fInputText);
3575                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3576                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3577                     // In the case of a CR/LF, we need to advance over both.
3578                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3579                     if (nextc == 0x0a) {
3580                         UTEXT_NEXT32(fInputText);
3581                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3582                     }
3583                 }
3584             }
3585             break;
3586 
3587 
3588         case URX_DOTANY_UNIX:
3589             {
3590                 // '.' operator, matches all, but stops at end-of-line.
3591                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3592                 if (fp->fInputIdx >= fActiveLimit) {
3593                     // At end of input.  Match failed.  Backtrack out.
3594                     fHitEnd = TRUE;
3595                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3596                     break;
3597                 }
3598 
3599                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3600 
3601                 // There is input left.  Advance over one char, unless we've hit end-of-line
3602                 UChar32 c = UTEXT_NEXT32(fInputText);
3603                 if (c == 0x0a) {
3604                     // End of line in normal mode.   '.' does not match the \n
3605                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3606                 } else {
3607                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3608                 }
3609             }
3610             break;
3611 
3612 
3613         case URX_JMP:
3614             fp->fPatIdx = opValue;
3615             break;
3616 
3617         case URX_FAIL:
3618             isMatch = FALSE;
3619             goto breakFromLoop;
3620 
3621         case URX_JMP_SAV:
3622             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3623             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3624             fp->fPatIdx = opValue;                         // Then JMP.
3625             break;
3626 
3627         case URX_JMP_SAV_X:
3628             // This opcode is used with (x)+, when x can match a zero length string.
3629             // Same as JMP_SAV, except conditional on the match having made forward progress.
3630             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3631             //   data address of the input position at the start of the loop.
3632             {
3633                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3634                 int32_t  stoOp = (int32_t)pat[opValue-1];
3635                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3636                 int32_t  frameLoc = URX_VAL(stoOp);
3637                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3638                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3639                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3640                 if (prevInputIdx < fp->fInputIdx) {
3641                     // The match did make progress.  Repeat the loop.
3642                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3643                     fp->fPatIdx = opValue;
3644                     fp->fExtra[frameLoc] = fp->fInputIdx;
3645                 }
3646                 // If the input position did not advance, we do nothing here,
3647                 //   execution will fall out of the loop.
3648             }
3649             break;
3650 
3651         case URX_CTR_INIT:
3652             {
3653                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3654                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3655 
3656                 // Pick up the three extra operands that CTR_INIT has, and
3657                 //    skip the pattern location counter past
3658                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3659                 fp->fPatIdx += 3;
3660                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3661                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3662                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3663                 U_ASSERT(minCount>=0);
3664                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3665                 U_ASSERT(loopLoc>fp->fPatIdx);
3666 
3667                 if (minCount == 0) {
3668                     fp = StateSave(fp, loopLoc+1, status);
3669                 }
3670                 if (maxCount == 0) {
3671                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3672                 }
3673             }
3674             break;
3675 
3676         case URX_CTR_LOOP:
3677             {
3678                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3679                 int32_t initOp = (int32_t)pat[opValue];
3680                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3681                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3682                 int32_t minCount  = (int32_t)pat[opValue+2];
3683                 int32_t maxCount  = (int32_t)pat[opValue+3];
3684                 // Increment the counter.  Note: we DIDN'T worry about counter
3685                 //   overflow, since the data comes from UnicodeStrings, which
3686                 //   stores its length in an int32_t. Do we have to think about
3687                 //   this now that we're using UText? Probably not, since the length
3688                 //   in UChar32s is still an int32_t.
3689                 (*pCounter)++;
3690                 U_ASSERT(*pCounter > 0);
3691                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3692                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3693                     break;
3694                 }
3695                 if (*pCounter >= minCount) {
3696                     fp = StateSave(fp, fp->fPatIdx, status);
3697                 }
3698                 fp->fPatIdx = opValue + 4;    // Loop back.
3699             }
3700             break;
3701 
3702         case URX_CTR_INIT_NG:
3703             {
3704                 // Initialize a non-greedy loop
3705                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3706                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3707 
3708                 // Pick up the three extra operands that CTR_INIT has, and
3709                 //    skip the pattern location counter past
3710                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3711                 fp->fPatIdx += 3;
3712                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3713                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3714                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3715                 U_ASSERT(minCount>=0);
3716                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3717                 U_ASSERT(loopLoc>fp->fPatIdx);
3718 
3719                 if (minCount == 0) {
3720                     if (maxCount != 0) {
3721                         fp = StateSave(fp, fp->fPatIdx, status);
3722                     }
3723                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3724                 }
3725             }
3726             break;
3727 
3728         case URX_CTR_LOOP_NG:
3729             {
3730                 // Non-greedy {min, max} loops
3731                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3732                 int32_t initOp = (int32_t)pat[opValue];
3733                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3734                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3735                 int32_t minCount  = (int32_t)pat[opValue+2];
3736                 int32_t maxCount  = (int32_t)pat[opValue+3];
3737                 // Increment the counter.  Note: we DIDN'T worry about counter
3738                 //   overflow, since the data comes from UnicodeStrings, which
3739                 //   stores its length in an int32_t. Do we have to think about
3740                 //   this now that we're using UText? Probably not, since the length
3741                 //   in UChar32s is still an int32_t.
3742                 (*pCounter)++;
3743                 U_ASSERT(*pCounter > 0);
3744 
3745                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3746                     // The loop has matched the maximum permitted number of times.
3747                     //   Break out of here with no action.  Matching will
3748                     //   continue with the following pattern.
3749                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3750                     break;
3751                 }
3752 
3753                 if (*pCounter < minCount) {
3754                     // We haven't met the minimum number of matches yet.
3755                     //   Loop back for another one.
3756                     fp->fPatIdx = opValue + 4;    // Loop back.
3757                 } else {
3758                     // We do have the minimum number of matches.
3759                     //   Fall into the following pattern, but first do
3760                     //   a state save to the top of the loop, so that a failure
3761                     //   in the following pattern will try another iteration of the loop.
3762                     fp = StateSave(fp, opValue + 4, status);
3763                 }
3764             }
3765             break;
3766 
3767         case URX_STO_SP:
3768             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3769             fData[opValue] = fStack->size();
3770             break;
3771 
3772         case URX_LD_SP:
3773             {
3774                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3775                 int32_t newStackSize = (int32_t)fData[opValue];
3776                 U_ASSERT(newStackSize <= fStack->size());
3777                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3778                 if (newFP == (int64_t *)fp) {
3779                     break;
3780                 }
3781                 int32_t i;
3782                 for (i=0; i<fFrameSize; i++) {
3783                     newFP[i] = ((int64_t *)fp)[i];
3784                 }
3785                 fp = (REStackFrame *)newFP;
3786                 fStack->setSize(newStackSize);
3787             }
3788             break;
3789 
3790         case URX_BACKREF:
3791         case URX_BACKREF_I:
3792             {
3793                 U_ASSERT(opValue < fFrameSize);
3794                 int64_t groupStartIdx = fp->fExtra[opValue];
3795                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3796                 U_ASSERT(groupStartIdx <= groupEndIdx);
3797                 if (groupStartIdx < 0) {
3798                     // This capture group has not participated in the match thus far,
3799                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3800                 }
3801 
3802                 if (groupEndIdx == groupStartIdx) {
3803                     //   The capture group match was of an empty string.
3804                     //   Verified by testing:  Perl matches succeed in this case, so
3805                     //   we do too.
3806                     break;
3807                 }
3808 
3809                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3810                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3811 
3812                 UBool haveMatch = (opType == URX_BACKREF ?
3813                     (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3814                     (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3815                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3816 
3817                 if (fp->fInputIdx > fActiveLimit) {
3818                     fHitEnd = TRUE;
3819                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3820                 } else if (!haveMatch) {
3821                     if (fp->fInputIdx == fActiveLimit) {
3822                         fHitEnd = TRUE;
3823                     }
3824                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3825                 }
3826             }
3827             break;
3828 
3829         case URX_STO_INP_LOC:
3830             {
3831                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3832                 fp->fExtra[opValue] = fp->fInputIdx;
3833             }
3834             break;
3835 
3836         case URX_JMPX:
3837             {
3838                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3839                 fp->fPatIdx += 1;
3840                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3841                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3842                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3843                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3844                 if (savedInputIdx < fp->fInputIdx) {
3845                     fp->fPatIdx = opValue;                               // JMP
3846                 } else {
3847                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3848                 }
3849             }
3850             break;
3851 
3852         case URX_LA_START:
3853             {
3854                 // Entering a lookahead block.
3855                 // Save Stack Ptr, Input Pos.
3856                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3857                 fData[opValue]   = fStack->size();
3858                 fData[opValue+1] = fp->fInputIdx;
3859                 fActiveStart     = fLookStart;          // Set the match region change for
3860                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3861             }
3862             break;
3863 
3864         case URX_LA_END:
3865             {
3866                 // Leaving a look-ahead block.
3867                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3868                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3869                 int32_t stackSize = fStack->size();
3870                 int32_t newStackSize =(int32_t)fData[opValue];
3871                 U_ASSERT(stackSize >= newStackSize);
3872                 if (stackSize > newStackSize) {
3873                     // Copy the current top frame back to the new (cut back) top frame.
3874                     //   This makes the capture groups from within the look-ahead
3875                     //   expression available.
3876                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3877                     int32_t i;
3878                     for (i=0; i<fFrameSize; i++) {
3879                         newFP[i] = ((int64_t *)fp)[i];
3880                     }
3881                     fp = (REStackFrame *)newFP;
3882                     fStack->setSize(newStackSize);
3883                 }
3884                 fp->fInputIdx = fData[opValue+1];
3885 
3886                 // Restore the active region bounds in the input string; they may have
3887                 //    been changed because of transparent bounds on a Region.
3888                 fActiveStart = fRegionStart;
3889                 fActiveLimit = fRegionLimit;
3890             }
3891             break;
3892 
3893         case URX_ONECHAR_I:
3894             if (fp->fInputIdx < fActiveLimit) {
3895                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3896 
3897                 UChar32 c = UTEXT_NEXT32(fInputText);
3898                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3899                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900                     break;
3901                 }
3902             } else {
3903                 fHitEnd = TRUE;
3904             }
3905 
3906             #ifdef REGEX_SMART_BACKTRACKING
3907             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3908                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3909                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3910                     UBool success = FALSE;
3911                     UChar32 c = UTEXT_PREVIOUS32(fInputText);
3912                     while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3913                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3914                             success = TRUE;
3915                             break;
3916                         } else if (c == U_SENTINEL) {
3917                             break;
3918                         }
3919                         c = UTEXT_PREVIOUS32(fInputText);
3920                     }
3921                     if (success) {
3922                         fHitEnd = FALSE;
3923                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3924                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3925                         if (fp->fInputIdx > backSearchIndex) {
3926                             fp = StateSave(fp, fp->fPatIdx, status);
3927                         }
3928                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
3929                         break;
3930                     }
3931                 }
3932             }
3933             #endif
3934 
3935             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3936             break;
3937 
3938         case URX_STRING_I:
3939             {
3940                 // Test input against a literal string.
3941                 // Strings require two slots in the compiled pattern, one for the
3942                 //   offset to the string text, and one for the length.
3943                 const UCaseProps *csp = ucase_getSingleton();
3944                 {
3945                     int32_t stringStartIdx, stringLen;
3946                     stringStartIdx = opValue;
3947 
3948                     op      = (int32_t)pat[fp->fPatIdx];
3949                     fp->fPatIdx++;
3950                     opType  = URX_TYPE(op);
3951                     opValue = URX_VAL(op);
3952                     U_ASSERT(opType == URX_STRING_LEN);
3953                     stringLen = opValue;
3954 
3955                     const UChar *patternChars = litText+stringStartIdx;
3956                     const UChar *patternEnd = patternChars+stringLen;
3957 
3958                     const UChar *foldChars = NULL;
3959                     int32_t foldOffset, foldLength;
3960                     UChar32 c;
3961 
3962                     foldOffset = foldLength = 0;
3963                     UBool success = TRUE;
3964 
3965                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3966                     while (patternChars < patternEnd && success) {
3967                         if(foldOffset < foldLength) {
3968                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3969                         } else {
3970                             c = UTEXT_NEXT32(fInputText);
3971                             if (c != U_SENTINEL) {
3972                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3973                                 if(foldLength >= 0) {
3974                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3975                                         foldOffset = 0;
3976                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3977                                     } else {
3978                                         c = foldLength;
3979                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
3980                                     }
3981                                 }
3982                             }
3983 
3984                             fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3985                         }
3986 
3987                         success = FALSE;
3988                         if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3989                             if (U_IS_BMP(c)) {
3990                                 success = (*patternChars == c);
3991                                 patternChars += 1;
3992                             } else if (patternChars+1 < patternEnd) {
3993                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3994                                 patternChars += 2;
3995                             }
3996                         } else {
3997                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
3998                         }
3999                     }
4000 
4001                     if (!success) {
4002                         #ifdef REGEX_SMART_BACKTRACKING
4003                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4004                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4005                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4006                                 // Reset to last start point
4007                                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4008                                 patternChars = litText+stringStartIdx;
4009 
4010                                 // Search backwards for a possible start
4011                                 do {
4012                                     c = UTEXT_PREVIOUS32(fInputText);
4013                                     if (c == U_SENTINEL) {
4014                                         break;
4015                                     } else {
4016                                         foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4017                                         if(foldLength >= 0) {
4018                                             if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
4019                                                 foldOffset = 0;
4020                                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4021                                             } else {
4022                                                 c = foldLength;
4023                                                 foldLength = foldOffset; // to avoid reading chars from the folding buffer
4024                                             }
4025                                         }
4026 
4027                                         if ((U_IS_BMP(c) && *patternChars == c) ||
4028                                                (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4029                                             success = TRUE;
4030                                             break;
4031                                         }
4032                                     }
4033                                 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4034 
4035                                 // And try again
4036                                 if (success) {
4037                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4038                                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4039                                     if (fp->fInputIdx > backSearchIndex) {
4040                                         fp = StateSave(fp, fp->fPatIdx, status);
4041                                     }
4042                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
4043                                     break;
4044                                 }
4045                             }
4046                         }
4047                         #endif
4048                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4049                     }
4050                 }
4051             }
4052             break;
4053 
4054         case URX_LB_START:
4055             {
4056                 // Entering a look-behind block.
4057                 // Save Stack Ptr, Input Pos.
4058                 //   TODO:  implement transparent bounds.  Ticket #6067
4059                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4060                 fData[opValue]   = fStack->size();
4061                 fData[opValue+1] = fp->fInputIdx;
4062                 // Init the variable containing the start index for attempted matches.
4063                 fData[opValue+2] = -1;
4064                 // Save input string length, then reset to pin any matches to end at
4065                 //   the current position.
4066                 fData[opValue+3] = fActiveLimit;
4067                 fActiveLimit     = fp->fInputIdx;
4068             }
4069             break;
4070 
4071 
4072         case URX_LB_CONT:
4073             {
4074                 // Positive Look-Behind, at top of loop checking for matches of LB expression
4075                 //    at all possible input starting positions.
4076 
4077                 // Fetch the min and max possible match lengths.  They are the operands
4078                 //   of this op in the pattern.
4079                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4080                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4081                 U_ASSERT(minML <= maxML);
4082                 U_ASSERT(minML >= 0);
4083 
4084                 // Fetch (from data) the last input index where a match was attempted.
4085                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4086                 int64_t  *lbStartIdx = &fData[opValue+2];
4087                 if (*lbStartIdx < 0) {
4088                     // First time through loop.
4089                     *lbStartIdx = fp->fInputIdx - minML;
4090                 } else {
4091                     // 2nd through nth time through the loop.
4092                     // Back up start position for match by one.
4093                     if (*lbStartIdx == 0) {
4094                         (*lbStartIdx)--;
4095                     } else {
4096                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4097                         UTEXT_PREVIOUS32(fInputText);
4098                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4099                     }
4100                 }
4101 
4102                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4103                     // We have tried all potential match starting points without
4104                     //  getting a match.  Backtrack out, and out of the
4105                     //   Look Behind altogether.
4106                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4107                     int64_t restoreInputLen = fData[opValue+3];
4108                     U_ASSERT(restoreInputLen >= fActiveLimit);
4109                     U_ASSERT(restoreInputLen <= fInputLength);
4110                     fActiveLimit = restoreInputLen;
4111                     break;
4112                 }
4113 
4114                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4115                 //      (successful match will fall off the end of the loop.)
4116                 fp = StateSave(fp, fp->fPatIdx-3, status);
4117                 fp->fInputIdx = *lbStartIdx;
4118             }
4119             break;
4120 
4121         case URX_LB_END:
4122             // End of a look-behind block, after a successful match.
4123             {
4124                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125                 if (fp->fInputIdx != fActiveLimit) {
4126                     //  The look-behind expression matched, but the match did not
4127                     //    extend all the way to the point that we are looking behind from.
4128                     //  FAIL out of here, which will take us back to the LB_CONT, which
4129                     //     will retry the match starting at another position or fail
4130                     //     the look-behind altogether, whichever is appropriate.
4131                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4132                     break;
4133                 }
4134 
4135                 // Look-behind match is good.  Restore the orignal input string length,
4136                 //   which had been truncated to pin the end of the lookbehind match to the
4137                 //   position being looked-behind.
4138                 int64_t originalInputLen = fData[opValue+3];
4139                 U_ASSERT(originalInputLen >= fActiveLimit);
4140                 U_ASSERT(originalInputLen <= fInputLength);
4141                 fActiveLimit = originalInputLen;
4142             }
4143             break;
4144 
4145 
4146         case URX_LBN_CONT:
4147             {
4148                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4149                 //    at all possible input starting positions.
4150 
4151                 // Fetch the extra parameters of this op.
4152                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4153                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4154                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4155                         continueLoc = URX_VAL(continueLoc);
4156                 U_ASSERT(minML <= maxML);
4157                 U_ASSERT(minML >= 0);
4158                 U_ASSERT(continueLoc > fp->fPatIdx);
4159 
4160                 // Fetch (from data) the last input index where a match was attempted.
4161                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4162                 int64_t  *lbStartIdx = &fData[opValue+2];
4163                 if (*lbStartIdx < 0) {
4164                     // First time through loop.
4165                     *lbStartIdx = fp->fInputIdx - minML;
4166                 } else {
4167                     // 2nd through nth time through the loop.
4168                     // Back up start position for match by one.
4169                     if (*lbStartIdx == 0) {
4170                         (*lbStartIdx)--;
4171                     } else {
4172                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4173                         UTEXT_PREVIOUS32(fInputText);
4174                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4175                     }
4176                 }
4177 
4178                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4179                     // We have tried all potential match starting points without
4180                     //  getting a match, which means that the negative lookbehind as
4181                     //  a whole has succeeded.  Jump forward to the continue location
4182                     int64_t restoreInputLen = fData[opValue+3];
4183                     U_ASSERT(restoreInputLen >= fActiveLimit);
4184                     U_ASSERT(restoreInputLen <= fInputLength);
4185                     fActiveLimit = restoreInputLen;
4186                     fp->fPatIdx = continueLoc;
4187                     break;
4188                 }
4189 
4190                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4191                 //      (successful match will cause a FAIL out of the loop altogether.)
4192                 fp = StateSave(fp, fp->fPatIdx-4, status);
4193                 fp->fInputIdx = *lbStartIdx;
4194             }
4195             break;
4196 
4197         case URX_LBN_END:
4198             // End of a negative look-behind block, after a successful match.
4199             {
4200                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4201                 if (fp->fInputIdx != fActiveLimit) {
4202                     //  The look-behind expression matched, but the match did not
4203                     //    extend all the way to the point that we are looking behind from.
4204                     //  FAIL out of here, which will take us back to the LB_CONT, which
4205                     //     will retry the match starting at another position or succeed
4206                     //     the look-behind altogether, whichever is appropriate.
4207                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4208                     break;
4209                 }
4210 
4211                 // Look-behind expression matched, which means look-behind test as
4212                 //   a whole Fails
4213 
4214                 //   Restore the orignal input string length, which had been truncated
4215                 //   inorder to pin the end of the lookbehind match
4216                 //   to the position being looked-behind.
4217                 int64_t originalInputLen = fData[opValue+3];
4218                 U_ASSERT(originalInputLen >= fActiveLimit);
4219                 U_ASSERT(originalInputLen <= fInputLength);
4220                 fActiveLimit = originalInputLen;
4221 
4222                 // Restore original stack position, discarding any state saved
4223                 //   by the successful pattern match.
4224                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4225                 int32_t newStackSize = (int32_t)fData[opValue];
4226                 U_ASSERT(fStack->size() > newStackSize);
4227                 fStack->setSize(newStackSize);
4228 
4229                 //  FAIL, which will take control back to someplace
4230                 //  prior to entering the look-behind test.
4231                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4232             }
4233             break;
4234 
4235 
4236         case URX_LOOP_SR_I:
4237             // Loop Initialization for the optimized implementation of
4238             //     [some character set]*
4239             //   This op scans through all matching input.
4240             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4241             {
4242                 U_ASSERT(opValue > 0 && opValue < sets->size());
4243                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4244                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4245 
4246                 // Loop through input, until either the input is exhausted or
4247                 //   we reach a character that is not a member of the set.
4248                 int64_t ix = fp->fInputIdx;
4249                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4250                 for (;;) {
4251                     if (ix >= fActiveLimit) {
4252                         fHitEnd = TRUE;
4253                         break;
4254                     }
4255                     UChar32 c = UTEXT_NEXT32(fInputText);
4256                     if (c<256) {
4257                         if (s8->contains(c) == FALSE) {
4258                             break;
4259                         }
4260                     } else {
4261                         if (s->contains(c) == FALSE) {
4262                             break;
4263                         }
4264                     }
4265                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4266                 }
4267 
4268                 // If there were no matching characters, skip over the loop altogether.
4269                 //   The loop doesn't run at all, a * op always succeeds.
4270                 if (ix == fp->fInputIdx) {
4271                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4272                     break;
4273                 }
4274 
4275                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4276                 //   must follow.  It's operand is the stack location
4277                 //   that holds the starting input index for the match of this [set]*
4278                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4279                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4280                 int32_t stackLoc = URX_VAL(loopcOp);
4281                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4282                 fp->fExtra[stackLoc] = fp->fInputIdx;
4283                 #ifdef REGEX_SMART_BACKTRACKING
4284                 backSearchIndex = fp->fInputIdx;
4285                 #endif
4286                 fp->fInputIdx = ix;
4287 
4288                 // Save State to the URX_LOOP_C op that follows this one,
4289                 //   so that match failures in the following code will return to there.
4290                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4291                 fp = StateSave(fp, fp->fPatIdx, status);
4292                 fp->fPatIdx++;
4293             }
4294             break;
4295 
4296 
4297         case URX_LOOP_DOT_I:
4298             // Loop Initialization for the optimized implementation of .*
4299             //   This op scans through all remaining input.
4300             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4301             {
4302                 // Loop through input until the input is exhausted (we reach an end-of-line)
4303                 // In DOTALL mode, we can just go straight to the end of the input.
4304                 int64_t ix;
4305                 if ((opValue & 1) == 1) {
4306                     // Dot-matches-All mode.  Jump straight to the end of the string.
4307                     ix = fActiveLimit;
4308                     fHitEnd = TRUE;
4309                 } else {
4310                     // NOT DOT ALL mode.  Line endings do not match '.'
4311                     // Scan forward until a line ending or end of input.
4312                     ix = fp->fInputIdx;
4313                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4314                     for (;;) {
4315                         if (ix >= fActiveLimit) {
4316                             fHitEnd = TRUE;
4317                             break;
4318                         }
4319                         UChar32 c = UTEXT_NEXT32(fInputText);
4320                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4321                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4322                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4323                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4324                                 //  char is a line ending.  Exit the scanning loop.
4325                                 break;
4326                             }
4327                         }
4328                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4329                     }
4330                 }
4331 
4332                 // If there were no matching characters, skip over the loop altogether.
4333                 //   The loop doesn't run at all, a * op always succeeds.
4334                 if (ix == fp->fInputIdx) {
4335                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4336                     break;
4337                 }
4338 
4339                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4340                 //   must follow.  It's operand is the stack location
4341                 //   that holds the starting input index for the match of this .*
4342                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4343                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4344                 int32_t stackLoc = URX_VAL(loopcOp);
4345                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4346                 fp->fExtra[stackLoc] = fp->fInputIdx;
4347                 #ifdef REGEX_SMART_BACKTRACKING
4348                 backSearchIndex = fp->fInputIdx;
4349                 #endif
4350                 fp->fInputIdx = ix;
4351 
4352                 // Save State to the URX_LOOP_C op that follows this one,
4353                 //   so that match failures in the following code will return to there.
4354                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4355                 fp = StateSave(fp, fp->fPatIdx, status);
4356                 fp->fPatIdx++;
4357             }
4358             break;
4359 
4360 
4361         case URX_LOOP_C:
4362             {
4363                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4364                 backSearchIndex = fp->fExtra[opValue];
4365                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4366                 if (backSearchIndex == fp->fInputIdx) {
4367                     // We've backed up the input idx to the point that the loop started.
4368                     // The loop is done.  Leave here without saving state.
4369                     //  Subsequent failures won't come back here.
4370                     break;
4371                 }
4372                 // Set up for the next iteration of the loop, with input index
4373                 //   backed up by one from the last time through,
4374                 //   and a state save to this instruction in case the following code fails again.
4375                 //   (We're going backwards because this loop emulates stack unwinding, not
4376                 //    the initial scan forward.)
4377                 U_ASSERT(fp->fInputIdx > 0);
4378                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4379                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4380                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4381 
4382                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4383                 if (prevC == 0x0a &&
4384                     fp->fInputIdx > backSearchIndex &&
4385                     twoPrevC == 0x0d) {
4386                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4387                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4388                         // .*, stepping back over CRLF pair.
4389                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4390                     }
4391                 }
4392 
4393 
4394                 fp = StateSave(fp, fp->fPatIdx-1, status);
4395             }
4396             break;
4397 
4398 
4399 
4400         default:
4401             // Trouble.  The compiled pattern contains an entry with an
4402             //           unrecognized type tag.
4403             U_ASSERT(FALSE);
4404         }
4405 
4406         if (U_FAILURE(status)) {
4407             isMatch = FALSE;
4408             break;
4409         }
4410     }
4411 
4412 breakFromLoop:
4413     fMatch = isMatch;
4414     if (isMatch) {
4415         fLastMatchEnd = fMatchEnd;
4416         fMatchStart   = startIdx;
4417         fMatchEnd     = fp->fInputIdx;
4418         if (fTraceDebug) {
4419             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4420         }
4421     }
4422     else
4423     {
4424         if (fTraceDebug) {
4425             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4426         }
4427     }
4428 
4429     fFrame = fp;                // The active stack frame when the engine stopped.
4430                                 //   Contains the capture group results that we need to
4431                                 //    access later.
4432     return;
4433 }
4434 
4435 
4436 //--------------------------------------------------------------------------------
4437 //
4438 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4439 //                  assumption that the entire string is available in the UText's
4440 //                  chunk buffer. For now, that means we can use int32_t indexes,
4441 //                  except for anything that needs to be saved (like group starts
4442 //                  and ends).
4443 //
4444 //                  startIdx:    begin matching a this index.
4445 //                  toEnd:       if true, match must extend to end of the input region
4446 //
4447 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4448 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4449     UBool       isMatch  = FALSE;      // True if the we have a match.
4450 
4451     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4452 
4453     int32_t     op;                    // Operation from the compiled pattern, split into
4454     int32_t     opType;                //    the opcode
4455     int32_t     opValue;               //    and the operand value.
4456 
4457 #ifdef REGEX_RUN_DEBUG
4458     if (fTraceDebug)
4459     {
4460         printf("MatchAt(startIdx=%ld)\n", startIdx);
4461         printf("Original Pattern: ");
4462         UChar32 c = utext_next32From(fPattern->fPattern, 0);
4463         while (c != U_SENTINEL) {
4464             if (c<32 || c>256) {
4465                 c = '.';
4466             }
4467             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4468 
4469             c = UTEXT_NEXT32(fPattern->fPattern);
4470         }
4471         printf("\n");
4472         printf("Input String: ");
4473         c = utext_next32From(fInputText, 0);
4474         while (c != U_SENTINEL) {
4475             if (c<32 || c>256) {
4476                 c = '.';
4477             }
4478             printf("%c", c);
4479 
4480             c = UTEXT_NEXT32(fInputText);
4481         }
4482         printf("\n");
4483         printf("\n");
4484     }
4485 #endif
4486 
4487     if (U_FAILURE(status)) {
4488         return;
4489     }
4490 
4491     //  Cache frequently referenced items from the compiled pattern
4492     //
4493     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4494 
4495     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4496     UVector             *sets          = fPattern->fSets;
4497 
4498     const UChar         *inputBuf      = fInputText->chunkContents;
4499 
4500     fFrameSize = fPattern->fFrameSize;
4501     REStackFrame        *fp            = resetStack();
4502 
4503     fp->fPatIdx   = 0;
4504     fp->fInputIdx = startIdx;
4505 
4506     // Zero out the pattern's static data
4507     int32_t i;
4508     for (i = 0; i<fPattern->fDataSize; i++) {
4509         fData[i] = 0;
4510     }
4511 
4512     //
4513     //  Main loop for interpreting the compiled pattern.
4514     //  One iteration of the loop per pattern operation performed.
4515     //
4516     for (;;) {
4517 #if 0
4518         if (_heapchk() != _HEAPOK) {
4519             fprintf(stderr, "Heap Trouble\n");
4520         }
4521 #endif
4522 
4523         op      = (int32_t)pat[fp->fPatIdx];
4524         opType  = URX_TYPE(op);
4525         opValue = URX_VAL(op);
4526 #ifdef REGEX_RUN_DEBUG
4527         if (fTraceDebug) {
4528             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4529             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4530                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4531             fPattern->dumpOp(fp->fPatIdx);
4532         }
4533 #endif
4534         fp->fPatIdx++;
4535 
4536         switch (opType) {
4537 
4538 
4539         case URX_NOP:
4540             break;
4541 
4542 
4543         case URX_BACKTRACK:
4544             // Force a backtrack.  In some circumstances, the pattern compiler
4545             //   will notice that the pattern can't possibly match anything, and will
4546             //   emit one of these at that point.
4547             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4548             break;
4549 
4550 
4551         case URX_ONECHAR:
4552             if (fp->fInputIdx < fActiveLimit) {
4553                 UChar32 c;
4554                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4555                 if (c == opValue) {
4556                     break;
4557                 }
4558             } else {
4559                 fHitEnd = TRUE;
4560             }
4561 
4562             #ifdef REGEX_SMART_BACKTRACKING
4563             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4564                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4565                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4566                     int64_t reverseIndex = fp->fInputIdx;
4567                     UChar32 c;
4568                     do {
4569                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4570                         if (c == opValue) {
4571                             break;
4572                         }
4573                     } while (reverseIndex > backSearchIndex);
4574                     if (c == opValue) {
4575                         fHitEnd = FALSE;
4576                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4577                         fp->fInputIdx = reverseIndex;
4578                         if (fp->fInputIdx > backSearchIndex) {
4579                             fp = StateSave(fp, fp->fPatIdx, status);
4580                         }
4581                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
4582                         break;
4583                     }
4584                 }
4585             }
4586             #endif
4587 
4588             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4589             break;
4590 
4591 
4592         case URX_STRING:
4593             {
4594                 // Test input against a literal string.
4595                 // Strings require two slots in the compiled pattern, one for the
4596                 //   offset to the string text, and one for the length.
4597                 int32_t   stringStartIdx = opValue;
4598                 int32_t   stringLen;
4599 
4600                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4601                 fp->fPatIdx++;
4602                 opType    = URX_TYPE(op);
4603                 stringLen = URX_VAL(op);
4604                 U_ASSERT(opType == URX_STRING_LEN);
4605                 U_ASSERT(stringLen >= 2);
4606 
4607                 if (fp->fInputIdx + stringLen > fActiveLimit) {
4608                     // No match.  String is longer than the remaining input text.
4609                     fHitEnd = TRUE;          //   TODO:  See ticket 6074
4610                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611                     break;
4612                 }
4613 
4614                 const UChar * pInp = inputBuf + fp->fInputIdx;
4615                 const UChar * pPat = litText+stringStartIdx;
4616                 const UChar * pEnd = pInp + stringLen;
4617                 UBool success = FALSE;
4618                 for(;;) {
4619                     if (*pInp == *pPat) {
4620                         pInp++;
4621                         pPat++;
4622                         if (pInp == pEnd) {
4623                             // Successful Match.
4624                             success = TRUE;
4625                             break;
4626                         }
4627                     } else {
4628                         // Match failed.
4629                         break;
4630                     }
4631                 }
4632 
4633                 if (success) {
4634                     fp->fInputIdx += stringLen;
4635                 } else {
4636                     #ifdef REGEX_SMART_BACKTRACKING
4637                     if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4638                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4639                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4640                             // Reset to last start point
4641                             int64_t reverseIndex = fp->fInputIdx;
4642                             UChar32 c;
4643                             pPat = litText+stringStartIdx;
4644 
4645                             // Search backwards for a possible start
4646                             do {
4647                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4648                                 if ((U_IS_BMP(c) && *pPat == c) ||
4649                                     (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4650                                     success = TRUE;
4651                                     break;
4652                                 }
4653                             } while (reverseIndex > backSearchIndex);
4654 
4655                             // And try again
4656                             if (success) {
4657                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658                                 fp->fInputIdx = reverseIndex;
4659                                 if (fp->fInputIdx > backSearchIndex) {
4660                                     fp = StateSave(fp, fp->fPatIdx, status);
4661                                 }
4662                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4663                                 break;
4664                             }
4665                         }
4666                     }
4667                     #endif
4668                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669                 }
4670             }
4671             break;
4672 
4673 
4674         case URX_STATE_SAVE:
4675             fp = StateSave(fp, opValue, status);
4676             break;
4677 
4678 
4679         case URX_END:
4680             // The match loop will exit via this path on a successful match,
4681             //   when we reach the end of the pattern.
4682             if (toEnd && fp->fInputIdx != fActiveLimit) {
4683                 // The pattern matched, but not to the end of input.  Try some more.
4684                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685                 break;
4686             }
4687             isMatch = TRUE;
4688             goto  breakFromLoop;
4689 
4690             // Start and End Capture stack frame variables are laid out out like this:
4691             //  fp->fExtra[opValue]  - The start of a completed capture group
4692             //             opValue+1 - The end   of a completed capture group
4693             //             opValue+2 - the start of a capture group whose end
4694             //                          has not yet been reached (and might not ever be).
4695         case URX_START_CAPTURE:
4696             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4697             fp->fExtra[opValue+2] = fp->fInputIdx;
4698             break;
4699 
4700 
4701         case URX_END_CAPTURE:
4702             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4703             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4704             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4705             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4706             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4707             break;
4708 
4709 
4710         case URX_DOLLAR:                   //  $, test for End of line
4711             //     or for position before new line at end of input
4712             if (fp->fInputIdx < fAnchorLimit-2) {
4713                 // We are no where near the end of input.  Fail.
4714                 //   This is the common case.  Keep it first.
4715                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716                 break;
4717             }
4718             if (fp->fInputIdx >= fAnchorLimit) {
4719                 // We really are at the end of input.  Success.
4720                 fHitEnd = TRUE;
4721                 fRequireEnd = TRUE;
4722                 break;
4723             }
4724 
4725             // If we are positioned just before a new-line that is located at the
4726             //   end of input, succeed.
4727             if (fp->fInputIdx == fAnchorLimit-1) {
4728                 UChar32 c;
4729                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4730 
4731                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4732                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4733                         // At new-line at end of input. Success
4734                         fHitEnd = TRUE;
4735                         fRequireEnd = TRUE;
4736                         break;
4737                     }
4738                 }
4739             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4740                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4741                     fHitEnd = TRUE;
4742                     fRequireEnd = TRUE;
4743                     break;                         // At CR/LF at end of input.  Success
4744             }
4745 
4746             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4747 
4748             break;
4749 
4750 
4751         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4752             if (fp->fInputIdx >= fAnchorLimit-1) {
4753                 // Either at the last character of input, or off the end.
4754                 if (fp->fInputIdx == fAnchorLimit-1) {
4755                     // At last char of input.  Success if it's a new line.
4756                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4757                         fHitEnd = TRUE;
4758                         fRequireEnd = TRUE;
4759                         break;
4760                     }
4761                 } else {
4762                     // Off the end of input.  Success.
4763                     fHitEnd = TRUE;
4764                     fRequireEnd = TRUE;
4765                     break;
4766                 }
4767             }
4768 
4769             // Not at end of input.  Back-track out.
4770             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4771             break;
4772 
4773 
4774         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4775             {
4776                 if (fp->fInputIdx >= fAnchorLimit) {
4777                     // We really are at the end of input.  Success.
4778                     fHitEnd = TRUE;
4779                     fRequireEnd = TRUE;
4780                     break;
4781                 }
4782                 // If we are positioned just before a new-line, succeed.
4783                 // It makes no difference where the new-line is within the input.
4784                 UChar32 c = inputBuf[fp->fInputIdx];
4785                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4786                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4787                     //  In multi-line mode, hitting a new-line just before the end of input does not
4788                     //   set the hitEnd or requireEnd flags
4789                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4790                         break;
4791                     }
4792                 }
4793                 // not at a new line.  Fail.
4794                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4795             }
4796             break;
4797 
4798 
4799         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4800             {
4801                 if (fp->fInputIdx >= fAnchorLimit) {
4802                     // We really are at the end of input.  Success.
4803                     fHitEnd = TRUE;
4804                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4805                     break;               //   adding a new-line would not lose the match.
4806                 }
4807                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4808                 // It makes no difference where the new-line is within the input.
4809                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4810                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4811                 }
4812             }
4813             break;
4814 
4815 
4816         case URX_CARET:                    //  ^, test for start of line
4817             if (fp->fInputIdx != fAnchorStart) {
4818                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4819             }
4820             break;
4821 
4822 
4823         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4824             {
4825                 if (fp->fInputIdx == fAnchorStart) {
4826                     // We are at the start input.  Success.
4827                     break;
4828                 }
4829                 // Check whether character just before the current pos is a new-line
4830                 //   unless we are at the end of input
4831                 UChar  c = inputBuf[fp->fInputIdx - 1];
4832                 if ((fp->fInputIdx < fAnchorLimit) &&
4833                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4834                     //  It's a new-line.  ^ is true.  Success.
4835                     //  TODO:  what should be done with positions between a CR and LF?
4836                     break;
4837                 }
4838                 // Not at the start of a line.  Fail.
4839                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840             }
4841             break;
4842 
4843 
4844         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4845             {
4846                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4847                 if (fp->fInputIdx <= fAnchorStart) {
4848                     // We are at the start input.  Success.
4849                     break;
4850                 }
4851                 // Check whether character just before the current pos is a new-line
4852                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4853                 UChar  c = inputBuf[fp->fInputIdx - 1];
4854                 if (c != 0x0a) {
4855                     // Not at the start of a line.  Back-track out.
4856                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857                 }
4858             }
4859             break;
4860 
4861         case URX_BACKSLASH_B:          // Test for word boundaries
4862             {
4863                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4864                 success ^= (opValue != 0);     // flip sense for \B
4865                 if (!success) {
4866                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4867                 }
4868             }
4869             break;
4870 
4871 
4872         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4873             {
4874                 UBool success = isUWordBoundary(fp->fInputIdx);
4875                 success ^= (opValue != 0);     // flip sense for \B
4876                 if (!success) {
4877                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4878                 }
4879             }
4880             break;
4881 
4882 
4883         case URX_BACKSLASH_D:            // Test for decimal digit
4884             {
4885                 if (fp->fInputIdx >= fActiveLimit) {
4886                     fHitEnd = TRUE;
4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                     break;
4889                 }
4890 
4891                 UChar32 c;
4892                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4893                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4894                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4895                 success ^= (opValue != 0);        // flip sense for \D
4896                 if (!success) {
4897                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4898                 }
4899             }
4900             break;
4901 
4902 
4903         case URX_BACKSLASH_G:          // Test for position at end of previous match
4904             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4905                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4906             }
4907             break;
4908 
4909 
4910         case URX_BACKSLASH_X:
4911         //  Match a Grapheme, as defined by Unicode TR 29.
4912         //  Differs slightly from Perl, which consumes combining marks independently
4913         //    of context.
4914         {
4915 
4916             // Fail if at end of input
4917             if (fp->fInputIdx >= fActiveLimit) {
4918                 fHitEnd = TRUE;
4919                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4920                 break;
4921             }
4922 
4923             // Examine (and consume) the current char.
4924             //   Dispatch into a little state machine, based on the char.
4925             UChar32  c;
4926             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4927             UnicodeSet **sets = fPattern->fStaticSets;
4928             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4929             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4930             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4931             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4932             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4933             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4934             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4935             goto GC_Extend;
4936 
4937 
4938 
4939 GC_L:
4940             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4941             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4942             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4943             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4944             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4945             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4946             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4947             goto GC_Extend;
4948 
4949 GC_V:
4950             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4951             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4952             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4953             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4954             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4955             goto GC_Extend;
4956 
4957 GC_T:
4958             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4959             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4960             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4961             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4962             goto GC_Extend;
4963 
4964 GC_Extend:
4965             // Combining characters are consumed here
4966             for (;;) {
4967                 if (fp->fInputIdx >= fActiveLimit) {
4968                     break;
4969                 }
4970                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4971                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4972                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4973                     break;
4974                 }
4975             }
4976             goto GC_Done;
4977 
4978 GC_Control:
4979             // Most control chars stand alone (don't combine with combining chars),
4980             //   except for that CR/LF sequence is a single grapheme cluster.
4981             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4982                 fp->fInputIdx++;
4983             }
4984 
4985 GC_Done:
4986             if (fp->fInputIdx >= fActiveLimit) {
4987                 fHitEnd = TRUE;
4988             }
4989             break;
4990         }
4991 
4992 
4993 
4994 
4995         case URX_BACKSLASH_Z:          // Test for end of Input
4996             if (fp->fInputIdx < fAnchorLimit) {
4997                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4998             } else {
4999                 fHitEnd = TRUE;
5000                 fRequireEnd = TRUE;
5001             }
5002             break;
5003 
5004 
5005 
5006         case URX_STATIC_SETREF:
5007             {
5008                 // Test input character against one of the predefined sets
5009                 //    (Word Characters, for example)
5010                 // The high bit of the op value is a flag for the match polarity.
5011                 //    0:   success if input char is in set.
5012                 //    1:   success if input char is not in set.
5013                 if (fp->fInputIdx >= fActiveLimit) {
5014                     fHitEnd = TRUE;
5015                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5016                     break;
5017                 }
5018 
5019                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5020                 opValue &= ~URX_NEG_SET;
5021                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5022 
5023                 UChar32 c;
5024                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5025                 if (c < 256) {
5026                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5027                     if (s8->contains(c)) {
5028                         success = !success;
5029                     }
5030                 } else {
5031                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5032                     if (s->contains(c)) {
5033                         success = !success;
5034                     }
5035                 }
5036                 if (!success) {
5037                     #ifdef REGEX_SMART_BACKTRACKING
5038                     if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5039                         REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5040                         if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5041                             // Try to find it, backwards
5042                             int64_t reverseIndex = fp->fInputIdx;
5043                             U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5044                             success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5045                             do {
5046                                 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5047                                 if (c < 256) {
5048                                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5049                                     if (s8->contains(c)) {
5050                                         success = !success;
5051                                     }
5052                                 } else {
5053                                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5054                                     if (s->contains(c)) {
5055                                         success = !success;
5056                                     }
5057                                 }
5058                             } while (reverseIndex > backSearchIndex && !success);
5059 
5060                             if (success) {
5061                                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5062                                 fp->fInputIdx = reverseIndex;
5063                                 if (fp->fInputIdx > backSearchIndex) {
5064                                     fp = StateSave(fp, fp->fPatIdx, status);
5065                                 }
5066                                 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5067                                 break;
5068                             }
5069                         }
5070                     }
5071                     #endif
5072                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5073                 }
5074             }
5075             break;
5076 
5077 
5078         case URX_STAT_SETREF_N:
5079             {
5080                 // Test input character for NOT being a member of  one of
5081                 //    the predefined sets (Word Characters, for example)
5082                 if (fp->fInputIdx >= fActiveLimit) {
5083                     fHitEnd = TRUE;
5084                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5085                     break;
5086                 }
5087 
5088                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5089 
5090                 UChar32  c;
5091                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5092                 if (c < 256) {
5093                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5094                     if (s8->contains(c) == FALSE) {
5095                         break;
5096                     }
5097                 } else {
5098                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
5099                     if (s->contains(c) == FALSE) {
5100                         break;
5101                     }
5102                 }
5103 
5104                 #ifdef REGEX_SMART_BACKTRACKING
5105                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5106                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5107                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5108                         // Try to find it, backwards
5109                         int64_t reverseIndex = fp->fInputIdx;
5110                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5111                         UBool success = FALSE;
5112                         do {
5113                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5114                             if (c < 256) {
5115                                 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5116                                 if (s8->contains(c) == FALSE) {
5117                                     success = TRUE;
5118                                     break;
5119                                 }
5120                             } else {
5121                                 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5122                                 if (s->contains(c) == FALSE) {
5123                                     success = TRUE;
5124                                     break;
5125                                 }
5126                             }
5127                         } while (reverseIndex > backSearchIndex);
5128 
5129                         if (success) {
5130                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5131                             fp->fInputIdx = reverseIndex;
5132                             if (fp->fInputIdx > backSearchIndex) {
5133                                 fp = StateSave(fp, fp->fPatIdx, status);
5134                             }
5135                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5136                             break;
5137                         }
5138                     }
5139                 }
5140                 #endif
5141                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5142             }
5143             break;
5144 
5145 
5146         case URX_SETREF:
5147             {
5148                 if (fp->fInputIdx >= fActiveLimit) {
5149                     fHitEnd = TRUE;
5150                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5151                     break;
5152                 }
5153 
5154                 U_ASSERT(opValue > 0 && opValue < sets->size());
5155 
5156                 // There is input left.  Pick up one char and test it for set membership.
5157                 UChar32  c;
5158                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5159                 if (c<256) {
5160                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5161                     if (s8->contains(c)) {
5162                         // The character is in the set.  A Match.
5163                         break;
5164                     }
5165                 } else {
5166                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5167                     if (s->contains(c)) {
5168                         // The character is in the set.  A Match.
5169                         break;
5170                     }
5171                 }
5172 
5173                 // the character wasn't in the set.
5174                 #ifdef REGEX_SMART_BACKTRACKING
5175                 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5176                     REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5177                     if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5178                         // Try to find it, backwards
5179                         int64_t reverseIndex = fp->fInputIdx;
5180                         U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5181                         UBool success = FALSE;
5182                         do {
5183                             U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5184                             if (c < 256) {
5185                                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5186                                 if (s8->contains(c)) {
5187                                     success = TRUE;
5188                                     break;
5189                                 }
5190                             } else {
5191                                 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5192                                 if (s->contains(c)) {
5193                                     success = TRUE;
5194                                     break;
5195                                 }
5196                             }
5197                         } while (reverseIndex > backSearchIndex);
5198 
5199                         if (success) {
5200                             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5201                             fp->fInputIdx = reverseIndex;
5202                             if (fp->fInputIdx > reverseIndex) {
5203                                 fp = StateSave(fp, fp->fPatIdx, status);
5204                             }
5205                             fp->fPatIdx++; // Skip the LOOP_C, we just did that
5206                             break;
5207                         }
5208                     }
5209                 }
5210                 #endif
5211                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5212             }
5213             break;
5214 
5215 
5216         case URX_DOTANY:
5217             {
5218                 // . matches anything, but stops at end-of-line.
5219                 if (fp->fInputIdx >= fActiveLimit) {
5220                     // At end of input.  Match failed.  Backtrack out.
5221                     fHitEnd = TRUE;
5222                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5223                     break;
5224                 }
5225 
5226                 // There is input left.  Advance over one char, unless we've hit end-of-line
5227                 UChar32  c;
5228                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5229                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5230                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5231                     // End of line in normal mode.   . does not match.
5232                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5233                     break;
5234                 }
5235             }
5236             break;
5237 
5238 
5239         case URX_DOTANY_ALL:
5240             {
5241                 // . in dot-matches-all (including new lines) mode
5242                 if (fp->fInputIdx >= fActiveLimit) {
5243                     // At end of input.  Match failed.  Backtrack out.
5244                     fHitEnd = TRUE;
5245                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5246                     break;
5247                 }
5248 
5249                 // There is input left.  Advance over one char, except if we are
5250                 //   at a cr/lf, advance over both of them.
5251                 UChar32 c;
5252                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5253                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5254                     // In the case of a CR/LF, we need to advance over both.
5255                     if (inputBuf[fp->fInputIdx] == 0x0a) {
5256                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5257                     }
5258                 }
5259             }
5260             break;
5261 
5262 
5263         case URX_DOTANY_UNIX:
5264             {
5265                 // '.' operator, matches all, but stops at end-of-line.
5266                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5267                 if (fp->fInputIdx >= fActiveLimit) {
5268                     // At end of input.  Match failed.  Backtrack out.
5269                     fHitEnd = TRUE;
5270                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5271                     break;
5272                 }
5273 
5274                 // There is input left.  Advance over one char, unless we've hit end-of-line
5275                 UChar32 c;
5276                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5277                 if (c == 0x0a) {
5278                     // End of line in normal mode.   '.' does not match the \n
5279                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5280                 }
5281             }
5282             break;
5283 
5284 
5285         case URX_JMP:
5286             fp->fPatIdx = opValue;
5287             break;
5288 
5289         case URX_FAIL:
5290             isMatch = FALSE;
5291             goto breakFromLoop;
5292 
5293         case URX_JMP_SAV:
5294             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5295             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5296             fp->fPatIdx = opValue;                         // Then JMP.
5297             break;
5298 
5299         case URX_JMP_SAV_X:
5300             // This opcode is used with (x)+, when x can match a zero length string.
5301             // Same as JMP_SAV, except conditional on the match having made forward progress.
5302             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5303             //   data address of the input position at the start of the loop.
5304             {
5305                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5306                 int32_t  stoOp = (int32_t)pat[opValue-1];
5307                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5308                 int32_t  frameLoc = URX_VAL(stoOp);
5309                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5310                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5311                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5312                 if (prevInputIdx < fp->fInputIdx) {
5313                     // The match did make progress.  Repeat the loop.
5314                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5315                     fp->fPatIdx = opValue;
5316                     fp->fExtra[frameLoc] = fp->fInputIdx;
5317                 }
5318                 // If the input position did not advance, we do nothing here,
5319                 //   execution will fall out of the loop.
5320             }
5321             break;
5322 
5323         case URX_CTR_INIT:
5324             {
5325                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5326                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5327 
5328                 // Pick up the three extra operands that CTR_INIT has, and
5329                 //    skip the pattern location counter past
5330                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5331                 fp->fPatIdx += 3;
5332                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5333                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5334                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5335                 U_ASSERT(minCount>=0);
5336                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5337                 U_ASSERT(loopLoc>fp->fPatIdx);
5338 
5339                 if (minCount == 0) {
5340                     fp = StateSave(fp, loopLoc+1, status);
5341                 }
5342                 if (maxCount == 0) {
5343                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5344                 }
5345             }
5346             break;
5347 
5348         case URX_CTR_LOOP:
5349             {
5350                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5351                 int32_t initOp = (int32_t)pat[opValue];
5352                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5353                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5354                 int32_t minCount  = (int32_t)pat[opValue+2];
5355                 int32_t maxCount  = (int32_t)pat[opValue+3];
5356                 // Increment the counter.  Note: we DIDN'T worry about counter
5357                 //   overflow, since the data comes from UnicodeStrings, which
5358                 //   stores its length in an int32_t. Do we have to think about
5359                 //   this now that we're using UText? Probably not, since the length
5360                 //   in UChar32s is still an int32_t.
5361                 (*pCounter)++;
5362                 U_ASSERT(*pCounter > 0);
5363                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5364                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5365                     break;
5366                 }
5367                 if (*pCounter >= minCount) {
5368                     fp = StateSave(fp, fp->fPatIdx, status);
5369                 }
5370                 fp->fPatIdx = opValue + 4;    // Loop back.
5371             }
5372             break;
5373 
5374         case URX_CTR_INIT_NG:
5375             {
5376                 // Initialize a non-greedy loop
5377                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5378                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5379 
5380                 // Pick up the three extra operands that CTR_INIT has, and
5381                 //    skip the pattern location counter past
5382                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5383                 fp->fPatIdx += 3;
5384                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5385                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5386                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5387                 U_ASSERT(minCount>=0);
5388                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5389                 U_ASSERT(loopLoc>fp->fPatIdx);
5390 
5391                 if (minCount == 0) {
5392                     if (maxCount != 0) {
5393                         fp = StateSave(fp, fp->fPatIdx, status);
5394                     }
5395                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5396                 }
5397             }
5398             break;
5399 
5400         case URX_CTR_LOOP_NG:
5401             {
5402                 // Non-greedy {min, max} loops
5403                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5404                 int32_t initOp = (int32_t)pat[opValue];
5405                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5406                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5407                 int32_t minCount  = (int32_t)pat[opValue+2];
5408                 int32_t maxCount  = (int32_t)pat[opValue+3];
5409                 // Increment the counter.  Note: we DIDN'T worry about counter
5410                 //   overflow, since the data comes from UnicodeStrings, which
5411                 //   stores its length in an int32_t. Do we have to think about
5412                 //   this now that we're using UText? Probably not, since the length
5413                 //   in UChar32s is still an int32_t.
5414                 (*pCounter)++;
5415                 U_ASSERT(*pCounter > 0);
5416 
5417                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5418                     // The loop has matched the maximum permitted number of times.
5419                     //   Break out of here with no action.  Matching will
5420                     //   continue with the following pattern.
5421                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5422                     break;
5423                 }
5424 
5425                 if (*pCounter < minCount) {
5426                     // We haven't met the minimum number of matches yet.
5427                     //   Loop back for another one.
5428                     fp->fPatIdx = opValue + 4;    // Loop back.
5429                 } else {
5430                     // We do have the minimum number of matches.
5431                     //   Fall into the following pattern, but first do
5432                     //   a state save to the top of the loop, so that a failure
5433                     //   in the following pattern will try another iteration of the loop.
5434                     fp = StateSave(fp, opValue + 4, status);
5435                 }
5436             }
5437             break;
5438 
5439         case URX_STO_SP:
5440             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5441             fData[opValue] = fStack->size();
5442             break;
5443 
5444         case URX_LD_SP:
5445             {
5446                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5447                 int32_t newStackSize = (int32_t)fData[opValue];
5448                 U_ASSERT(newStackSize <= fStack->size());
5449                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5450                 if (newFP == (int64_t *)fp) {
5451                     break;
5452                 }
5453                 int32_t i;
5454                 for (i=0; i<fFrameSize; i++) {
5455                     newFP[i] = ((int64_t *)fp)[i];
5456                 }
5457                 fp = (REStackFrame *)newFP;
5458                 fStack->setSize(newStackSize);
5459             }
5460             break;
5461 
5462         case URX_BACKREF:
5463         case URX_BACKREF_I:
5464             {
5465                 U_ASSERT(opValue < fFrameSize);
5466                 int64_t groupStartIdx = fp->fExtra[opValue];
5467                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5468                 U_ASSERT(groupStartIdx <= groupEndIdx);
5469                 int64_t len = groupEndIdx-groupStartIdx;
5470                 if (groupStartIdx < 0) {
5471                     // This capture group has not participated in the match thus far,
5472                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5473                 }
5474 
5475                 if (len == 0) {
5476                         //   The capture group match was of an empty string.
5477                         //   Verified by testing:  Perl matches succeed in this case, so
5478                         //   we do too.
5479                         break;
5480                     }
5481 
5482                 UBool  haveMatch = FALSE;
5483                 if (fp->fInputIdx + len <= fActiveLimit) {
5484                     if (opType == URX_BACKREF) {
5485                         if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5486                             haveMatch = TRUE;
5487                         }
5488                     } else {
5489                         if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5490                                   (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5491                             haveMatch = TRUE;
5492                         }
5493                     }
5494                 } else {
5495                     // TODO: probably need to do a partial string comparison, and only
5496                     //       set HitEnd if the available input matched.  Ticket #6074
5497                     fHitEnd = TRUE;
5498                 }
5499                 if (haveMatch) {
5500                     fp->fInputIdx += len;     // Match.  Advance current input position.
5501                 } else {
5502                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5503                 }
5504             }
5505             break;
5506 
5507         case URX_STO_INP_LOC:
5508             {
5509                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5510                 fp->fExtra[opValue] = fp->fInputIdx;
5511             }
5512             break;
5513 
5514         case URX_JMPX:
5515             {
5516                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5517                 fp->fPatIdx += 1;
5518                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5519                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5520                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5521                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5522                 if (savedInputIdx < fp->fInputIdx) {
5523                     fp->fPatIdx = opValue;                               // JMP
5524                 } else {
5525                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5526                 }
5527             }
5528             break;
5529 
5530         case URX_LA_START:
5531             {
5532                 // Entering a lookahead block.
5533                 // Save Stack Ptr, Input Pos.
5534                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5535                 fData[opValue]   = fStack->size();
5536                 fData[opValue+1] = fp->fInputIdx;
5537                 fActiveStart     = fLookStart;          // Set the match region change for
5538                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5539             }
5540             break;
5541 
5542         case URX_LA_END:
5543             {
5544                 // Leaving a look-ahead block.
5545                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5546                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5547                 int32_t stackSize = fStack->size();
5548                 int32_t newStackSize = (int32_t)fData[opValue];
5549                 U_ASSERT(stackSize >= newStackSize);
5550                 if (stackSize > newStackSize) {
5551                     // Copy the current top frame back to the new (cut back) top frame.
5552                     //   This makes the capture groups from within the look-ahead
5553                     //   expression available.
5554                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5555                     int32_t i;
5556                     for (i=0; i<fFrameSize; i++) {
5557                         newFP[i] = ((int64_t *)fp)[i];
5558                     }
5559                     fp = (REStackFrame *)newFP;
5560                     fStack->setSize(newStackSize);
5561                 }
5562                 fp->fInputIdx = fData[opValue+1];
5563 
5564                 // Restore the active region bounds in the input string; they may have
5565                 //    been changed because of transparent bounds on a Region.
5566                 fActiveStart = fRegionStart;
5567                 fActiveLimit = fRegionLimit;
5568             }
5569             break;
5570 
5571         case URX_ONECHAR_I:
5572             if (fp->fInputIdx < fActiveLimit) {
5573                 UChar32 c;
5574                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5575                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5576                     break;
5577                 }
5578             } else {
5579                 fHitEnd = TRUE;
5580             }
5581 
5582             #ifdef REGEX_SMART_BACKTRACKING
5583             if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5584                 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5585                 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5586                     UBool success = FALSE;
5587                     int64_t reverseIndex = fp->fInputIdx;
5588                     UChar32 c;
5589                     while (reverseIndex > backSearchIndex) {
5590                         U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5591                         if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5592                             success = TRUE;
5593                             break;
5594                         } else if (c == U_SENTINEL) {
5595                             break;
5596                         }
5597                     }
5598                     if (success) {
5599                         fHitEnd = FALSE;
5600                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5601                         fp->fInputIdx = reverseIndex;
5602                         if (fp->fInputIdx > backSearchIndex) {
5603                             fp = StateSave(fp, fp->fPatIdx, status);
5604                         }
5605                         fp->fPatIdx++; // Skip the LOOP_C, we just did that
5606                         break;
5607                     }
5608                 }
5609             }
5610             #endif
5611 
5612             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5613             break;
5614 
5615         case URX_STRING_I:
5616             {
5617                 // Test input against a literal string.
5618                 // Strings require two slots in the compiled pattern, one for the
5619                 //   offset to the string text, and one for the length.
5620                 const UCaseProps *csp = ucase_getSingleton();
5621                 {
5622                     int32_t stringStartIdx, stringLen;
5623                     stringStartIdx = opValue;
5624 
5625                     op      = (int32_t)pat[fp->fPatIdx];
5626                     fp->fPatIdx++;
5627                     opType  = URX_TYPE(op);
5628                     opValue = URX_VAL(op);
5629                     U_ASSERT(opType == URX_STRING_LEN);
5630                     stringLen = opValue;
5631 
5632                     const UChar *patternChars = litText+stringStartIdx;
5633                     const UChar *patternEnd = patternChars+stringLen;
5634 
5635                     const UChar *foldChars = NULL;
5636                     int32_t foldOffset, foldLength;
5637                     UChar32 c;
5638                     // BEGIN android-changed
5639                     // For ICU ticket#8824
5640                     UBool c_is_valid = FALSE;
5641 
5642                     #ifdef REGEX_SMART_BACKTRACKING
5643                     int32_t originalInputIdx = fp->fInputIdx;
5644                     #endif
5645                     UBool success = TRUE;
5646 
5647                     foldOffset = foldLength = 0;
5648 
5649                     while (patternChars < patternEnd && success) {
5650                         if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5651                             if(foldOffset < foldLength) {
5652                                 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5653                                 c_is_valid = TRUE;
5654                             } else {
5655                                 // test pre-condition of U16_NEXT: i < length
5656                                 U_ASSERT(fp->fInputIdx < fActiveLimit);
5657                                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5658                                 c_is_valid = TRUE;
5659                                 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5660                                 if(foldLength >= 0) {
5661                                     if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5662                                         foldOffset = 0;
5663                                         U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5664                                     } else {
5665                                         c = foldLength;
5666                                         foldLength = foldOffset; // to avoid reading chars from the folding buffer
5667                                     }
5668                                 }
5669                             }
5670                         } else {
5671                             c_is_valid = FALSE;
5672                         }
5673 
5674                         if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5675                             if (U_IS_BMP(c)) {
5676                                 success = (*patternChars == c);
5677                                 patternChars += 1;
5678                             } else if (patternChars+1 < patternEnd) {
5679                                 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5680                                 patternChars += 2;
5681                             }
5682                         } else {
5683                             success = FALSE;
5684                             fHitEnd = TRUE;          //   TODO:  See ticket 6074
5685                         }
5686                     }
5687                     // END android-changed
5688 
5689                     if (!success) {
5690                         #ifdef REGEX_SMART_BACKTRACKING
5691                         if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5692                             REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5693                             if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5694                                 // Reset to last start point
5695                                 int64_t reverseIndex = originalInputIdx;
5696                                 patternChars = litText+stringStartIdx;
5697 
5698                                 // Search backwards for a possible start
5699                                 do {
5700                                     U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5701                                     foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5702                                     if(foldLength >= 0) {
5703                                         if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5704                                             foldOffset = 0;
5705                                             U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5706                                         } else {
5707                                             c = foldLength;
5708                                             foldLength = foldOffset; // to avoid reading chars from the folding buffer
5709                                         }
5710                                     }
5711 
5712                                     if ((U_IS_BMP(c) && *patternChars == c) ||
5713                                            (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5714                                         success = TRUE;
5715                                         break;
5716                                     }
5717                                 } while (reverseIndex > backSearchIndex);
5718 
5719                                 // And try again
5720                                 if (success) {
5721                                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5722                                     fp->fInputIdx = reverseIndex;
5723                                     if (fp->fInputIdx > backSearchIndex) {
5724                                         fp = StateSave(fp, fp->fPatIdx, status);
5725                                     }
5726                                     fp->fPatIdx++; // Skip the LOOP_C, we just did that
5727                                     break;
5728                                 }
5729                             }
5730                         }
5731                         #endif
5732                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5733                     }
5734                 }
5735             }
5736             break;
5737 
5738         case URX_LB_START:
5739             {
5740                 // Entering a look-behind block.
5741                 // Save Stack Ptr, Input Pos.
5742                 //   TODO:  implement transparent bounds.  Ticket #6067
5743                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5744                 fData[opValue]   = fStack->size();
5745                 fData[opValue+1] = fp->fInputIdx;
5746                 // Init the variable containing the start index for attempted matches.
5747                 fData[opValue+2] = -1;
5748                 // Save input string length, then reset to pin any matches to end at
5749                 //   the current position.
5750                 fData[opValue+3] = fActiveLimit;
5751                 fActiveLimit     = fp->fInputIdx;
5752             }
5753             break;
5754 
5755 
5756         case URX_LB_CONT:
5757             {
5758                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5759                 //    at all possible input starting positions.
5760 
5761                 // Fetch the min and max possible match lengths.  They are the operands
5762                 //   of this op in the pattern.
5763                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5764                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5765                 U_ASSERT(minML <= maxML);
5766                 U_ASSERT(minML >= 0);
5767 
5768                 // Fetch (from data) the last input index where a match was attempted.
5769                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5770                 int64_t  *lbStartIdx = &fData[opValue+2];
5771                 if (*lbStartIdx < 0) {
5772                     // First time through loop.
5773                     *lbStartIdx = fp->fInputIdx - minML;
5774                 } else {
5775                     // 2nd through nth time through the loop.
5776                     // Back up start position for match by one.
5777                     if (*lbStartIdx == 0) {
5778                         (*lbStartIdx)--;
5779                     } else {
5780                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5781                     }
5782                 }
5783 
5784                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5785                     // We have tried all potential match starting points without
5786                     //  getting a match.  Backtrack out, and out of the
5787                     //   Look Behind altogether.
5788                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5789                     int64_t restoreInputLen = fData[opValue+3];
5790                     U_ASSERT(restoreInputLen >= fActiveLimit);
5791                     U_ASSERT(restoreInputLen <= fInputLength);
5792                     fActiveLimit = restoreInputLen;
5793                     break;
5794                 }
5795 
5796                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5797                 //      (successful match will fall off the end of the loop.)
5798                 fp = StateSave(fp, fp->fPatIdx-3, status);
5799                 fp->fInputIdx =  *lbStartIdx;
5800             }
5801             break;
5802 
5803         case URX_LB_END:
5804             // End of a look-behind block, after a successful match.
5805             {
5806                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5807                 if (fp->fInputIdx != fActiveLimit) {
5808                     //  The look-behind expression matched, but the match did not
5809                     //    extend all the way to the point that we are looking behind from.
5810                     //  FAIL out of here, which will take us back to the LB_CONT, which
5811                     //     will retry the match starting at another position or fail
5812                     //     the look-behind altogether, whichever is appropriate.
5813                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5814                     break;
5815                 }
5816 
5817                 // Look-behind match is good.  Restore the orignal input string length,
5818                 //   which had been truncated to pin the end of the lookbehind match to the
5819                 //   position being looked-behind.
5820                 int64_t originalInputLen = fData[opValue+3];
5821                 U_ASSERT(originalInputLen >= fActiveLimit);
5822                 U_ASSERT(originalInputLen <= fInputLength);
5823                 fActiveLimit = originalInputLen;
5824             }
5825             break;
5826 
5827 
5828         case URX_LBN_CONT:
5829             {
5830                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5831                 //    at all possible input starting positions.
5832 
5833                 // Fetch the extra parameters of this op.
5834                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5835                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5836                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5837                 continueLoc = URX_VAL(continueLoc);
5838                 U_ASSERT(minML <= maxML);
5839                 U_ASSERT(minML >= 0);
5840                 U_ASSERT(continueLoc > fp->fPatIdx);
5841 
5842                 // Fetch (from data) the last input index where a match was attempted.
5843                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5844                 int64_t  *lbStartIdx = &fData[opValue+2];
5845                 if (*lbStartIdx < 0) {
5846                     // First time through loop.
5847                     *lbStartIdx = fp->fInputIdx - minML;
5848                 } else {
5849                     // 2nd through nth time through the loop.
5850                     // Back up start position for match by one.
5851                     if (*lbStartIdx == 0) {
5852                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5853                     } else {
5854                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5855                     }
5856                 }
5857 
5858                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5859                     // We have tried all potential match starting points without
5860                     //  getting a match, which means that the negative lookbehind as
5861                     //  a whole has succeeded.  Jump forward to the continue location
5862                     int64_t restoreInputLen = fData[opValue+3];
5863                     U_ASSERT(restoreInputLen >= fActiveLimit);
5864                     U_ASSERT(restoreInputLen <= fInputLength);
5865                     fActiveLimit = restoreInputLen;
5866                     fp->fPatIdx = continueLoc;
5867                     break;
5868                 }
5869 
5870                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5871                 //      (successful match will cause a FAIL out of the loop altogether.)
5872                 fp = StateSave(fp, fp->fPatIdx-4, status);
5873                 fp->fInputIdx =  *lbStartIdx;
5874             }
5875             break;
5876 
5877         case URX_LBN_END:
5878             // End of a negative look-behind block, after a successful match.
5879             {
5880                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5881                 if (fp->fInputIdx != fActiveLimit) {
5882                     //  The look-behind expression matched, but the match did not
5883                     //    extend all the way to the point that we are looking behind from.
5884                     //  FAIL out of here, which will take us back to the LB_CONT, which
5885                     //     will retry the match starting at another position or succeed
5886                     //     the look-behind altogether, whichever is appropriate.
5887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5888                     break;
5889                 }
5890 
5891                 // Look-behind expression matched, which means look-behind test as
5892                 //   a whole Fails
5893 
5894                 //   Restore the orignal input string length, which had been truncated
5895                 //   inorder to pin the end of the lookbehind match
5896                 //   to the position being looked-behind.
5897                 int64_t originalInputLen = fData[opValue+3];
5898                 U_ASSERT(originalInputLen >= fActiveLimit);
5899                 U_ASSERT(originalInputLen <= fInputLength);
5900                 fActiveLimit = originalInputLen;
5901 
5902                 // Restore original stack position, discarding any state saved
5903                 //   by the successful pattern match.
5904                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5905                 int32_t newStackSize = (int32_t)fData[opValue];
5906                 U_ASSERT(fStack->size() > newStackSize);
5907                 fStack->setSize(newStackSize);
5908 
5909                 //  FAIL, which will take control back to someplace
5910                 //  prior to entering the look-behind test.
5911                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5912             }
5913             break;
5914 
5915 
5916         case URX_LOOP_SR_I:
5917             // Loop Initialization for the optimized implementation of
5918             //     [some character set]*
5919             //   This op scans through all matching input.
5920             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5921             {
5922                 U_ASSERT(opValue > 0 && opValue < sets->size());
5923                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5924                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5925 
5926                 // Loop through input, until either the input is exhausted or
5927                 //   we reach a character that is not a member of the set.
5928                 int32_t ix = (int32_t)fp->fInputIdx;
5929                 for (;;) {
5930                     if (ix >= fActiveLimit) {
5931                         fHitEnd = TRUE;
5932                         break;
5933                     }
5934                     UChar32   c;
5935                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5936                     if (c<256) {
5937                         if (s8->contains(c) == FALSE) {
5938                             U16_BACK_1(inputBuf, 0, ix);
5939                             break;
5940                         }
5941                     } else {
5942                         if (s->contains(c) == FALSE) {
5943                             U16_BACK_1(inputBuf, 0, ix);
5944                             break;
5945                         }
5946                     }
5947                 }
5948 
5949                 // If there were no matching characters, skip over the loop altogether.
5950                 //   The loop doesn't run at all, a * op always succeeds.
5951                 if (ix == fp->fInputIdx) {
5952                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5953                     break;
5954                 }
5955 
5956                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5957                 //   must follow.  It's operand is the stack location
5958                 //   that holds the starting input index for the match of this [set]*
5959                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5960                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5961                 int32_t stackLoc = URX_VAL(loopcOp);
5962                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5963                 fp->fExtra[stackLoc] = fp->fInputIdx;
5964                 #ifdef REGEX_SMART_BACKTRACKING
5965                 backSearchIndex = fp->fInputIdx;
5966                 #endif
5967                 fp->fInputIdx = ix;
5968 
5969                 // Save State to the URX_LOOP_C op that follows this one,
5970                 //   so that match failures in the following code will return to there.
5971                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5972                 fp = StateSave(fp, fp->fPatIdx, status);
5973                 fp->fPatIdx++;
5974             }
5975             break;
5976 
5977 
5978         case URX_LOOP_DOT_I:
5979             // Loop Initialization for the optimized implementation of .*
5980             //   This op scans through all remaining input.
5981             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5982             {
5983                 // Loop through input until the input is exhausted (we reach an end-of-line)
5984                 // In DOTALL mode, we can just go straight to the end of the input.
5985                 int32_t ix;
5986                 if ((opValue & 1) == 1) {
5987                     // Dot-matches-All mode.  Jump straight to the end of the string.
5988                     ix = (int32_t)fActiveLimit;
5989                     fHitEnd = TRUE;
5990                 } else {
5991                     // NOT DOT ALL mode.  Line endings do not match '.'
5992                     // Scan forward until a line ending or end of input.
5993                     ix = (int32_t)fp->fInputIdx;
5994                     for (;;) {
5995                         if (ix >= fActiveLimit) {
5996                             fHitEnd = TRUE;
5997                             break;
5998                         }
5999                         UChar32   c;
6000                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
6001                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
6002                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
6003                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
6004                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6005                                 //  char is a line ending.  Put the input pos back to the
6006                                 //    line ending char, and exit the scanning loop.
6007                                 U16_BACK_1(inputBuf, 0, ix);
6008                                 break;
6009                             }
6010                         }
6011                     }
6012                 }
6013 
6014                 // If there were no matching characters, skip over the loop altogether.
6015                 //   The loop doesn't run at all, a * op always succeeds.
6016                 if (ix == fp->fInputIdx) {
6017                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
6018                     break;
6019                 }
6020 
6021                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6022                 //   must follow.  It's operand is the stack location
6023                 //   that holds the starting input index for the match of this .*
6024                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6025                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6026                 int32_t stackLoc = URX_VAL(loopcOp);
6027                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6028                 fp->fExtra[stackLoc] = fp->fInputIdx;
6029                 #ifdef REGEX_SMART_BACKTRACKING
6030                 backSearchIndex = fp->fInputIdx;
6031                 #endif
6032                 fp->fInputIdx = ix;
6033 
6034                 // Save State to the URX_LOOP_C op that follows this one,
6035                 //   so that match failures in the following code will return to there.
6036                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6037                 fp = StateSave(fp, fp->fPatIdx, status);
6038                 fp->fPatIdx++;
6039             }
6040             break;
6041 
6042 
6043         case URX_LOOP_C:
6044             {
6045                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
6046                 backSearchIndex = (int32_t)fp->fExtra[opValue];
6047                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
6048                 if (backSearchIndex == fp->fInputIdx) {
6049                     // We've backed up the input idx to the point that the loop started.
6050                     // The loop is done.  Leave here without saving state.
6051                     //  Subsequent failures won't come back here.
6052                     break;
6053                 }
6054                 // Set up for the next iteration of the loop, with input index
6055                 //   backed up by one from the last time through,
6056                 //   and a state save to this instruction in case the following code fails again.
6057                 //   (We're going backwards because this loop emulates stack unwinding, not
6058                 //    the initial scan forward.)
6059                 U_ASSERT(fp->fInputIdx > 0);
6060                 UChar32 prevC;
6061                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6062 
6063                 if (prevC == 0x0a &&
6064                     fp->fInputIdx > backSearchIndex &&
6065                     inputBuf[fp->fInputIdx-1] == 0x0d) {
6066                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6067                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6068                         // .*, stepping back over CRLF pair.
6069                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6070                     }
6071                 }
6072 
6073 
6074                 fp = StateSave(fp, fp->fPatIdx-1, status);
6075             }
6076             break;
6077 
6078 
6079 
6080         default:
6081             // Trouble.  The compiled pattern contains an entry with an
6082             //           unrecognized type tag.
6083             U_ASSERT(FALSE);
6084         }
6085 
6086         if (U_FAILURE(status)) {
6087             isMatch = FALSE;
6088             break;
6089         }
6090     }
6091 
6092 breakFromLoop:
6093     fMatch = isMatch;
6094     if (isMatch) {
6095         fLastMatchEnd = fMatchEnd;
6096         fMatchStart   = startIdx;
6097         fMatchEnd     = fp->fInputIdx;
6098         if (fTraceDebug) {
6099             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6100         }
6101     }
6102     else
6103     {
6104         if (fTraceDebug) {
6105             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6106         }
6107     }
6108 
6109     fFrame = fp;                // The active stack frame when the engine stopped.
6110     //   Contains the capture group results that we need to
6111     //    access later.
6112 
6113     return;
6114 }
6115 
6116 
6117 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6118 
6119 U_NAMESPACE_END
6120 
6121 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6122 
6123