• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2012 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33 
34 // #include <malloc.h>        // Needed for heapcheck testing
35 
36 
// Find progress callback
// ----------------------
// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
//
// Evaluates to true when a find-progress callback is installed AND the call to
// ReportFindProgress(pos, status) returned FALSE; ReportFindProgress() is only
// invoked when fFindProgressCallbackFn is non-NULL.
//
// The whole expansion is parenthesized so that the macro behaves as a single
// operand wherever it is used (e.g. under '!' or next to '||').
//
#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
    ((fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE))
43 
44 
45 // Smart Backtracking
46 // ------------------
47 // When a failure would go back to a LOOP_C instruction,
48 // strings, characters, and setrefs scan backwards for a valid start
49 // character themselves, pop the stack, and save state, emulating the
50 // LOOP_C's effect but assured that the next character of input is a
51 // possible matching character.
52 //
53 // Good idea in theory; unfortunately it only helps out a few specific
54 // cases and slows the engine down a little in the rest.
55 
56 U_NAMESPACE_BEGIN
57 
58 // Default limit for the size of the back track stack, to avoid system
59 //    failures caused by heap exhaustion.  Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 //    which use recursion rather than the heap, and take more storage per
62 //    backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65 
66 // Time limit counter constant.
67 //   Time limits for expression evaluation are in terms of quanta of work by
68 //   the engine, each of which is 10,000 state saves.
69 //   This constant sets the number of state saves per tick.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71 
72 //-----------------------------------------------------------------------------
73 //
74 //   Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78     fDeferredStatus = U_ZERO_ERROR;
79     init(fDeferredStatus);
80     if (U_FAILURE(fDeferredStatus)) {
81         return;
82     }
83     if (pat==NULL) {
84         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85         return;
86     }
87     fPattern = pat;
88     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90 
91 
92 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                            uint32_t flags, UErrorCode &status) {
95     init(status);
96     if (U_FAILURE(status)) {
97         return;
98     }
99     UParseError    pe;
100     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101     fPattern           = fPatternOwned;
102 
103     UText inputText = UTEXT_INITIALIZER;
104     utext_openConstUnicodeString(&inputText, &input, &status);
105     init2(&inputText, status);
106     utext_close(&inputText);
107 
108     fInputUniStrMaybeMutable = TRUE;
109 }
110 
111 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                            uint32_t flags, UErrorCode &status) {
114     init(status);
115     if (U_FAILURE(status)) {
116         return;
117     }
118     UParseError    pe;
119     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120     if (U_FAILURE(status)) {
121         return;
122     }
123 
124     fPattern           = fPatternOwned;
125     init2(input, status);
126 }
127 
128 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                            uint32_t flags, UErrorCode &status) {
131     init(status);
132     if (U_FAILURE(status)) {
133         return;
134     }
135     UParseError    pe;
136     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137     if (U_FAILURE(status)) {
138         return;
139     }
140     fPattern           = fPatternOwned;
141     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145                            uint32_t flags, UErrorCode &status) {
146     init(status);
147     if (U_FAILURE(status)) {
148         return;
149     }
150     UParseError    pe;
151     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152         if (U_FAILURE(status)) {
153         return;
154     }
155 
156     fPattern           = fPatternOwned;
157     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159 
160 
161 
162 
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164     delete fStack;
165     if (fData != fSmallData) {
166         uprv_free(fData);
167         fData = NULL;
168     }
169     if (fPatternOwned) {
170         delete fPatternOwned;
171         fPatternOwned = NULL;
172         fPattern = NULL;
173     }
174 
175     if (fInput) {
176         delete fInput;
177     }
178     if (fInputText) {
179         utext_close(fInputText);
180     }
181     if (fAltInputText) {
182         utext_close(fAltInputText);
183     }
184 
185     #if UCONFIG_NO_BREAK_ITERATION==0
186     delete fWordBreakItr;
187     #endif
188 }
189 
190 //
191 //   init()   common initialization for use by all constructors.
192 //            Initialize all fields, get the object into a consistent state.
193 //            This must be done even when the initial status shows an error,
194 //            so that the object is initialized sufficiently well for the destructor
195 //            to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198     fPattern           = NULL;
199     fPatternOwned      = NULL;
200     fFrameSize         = 0;
201     fRegionStart       = 0;
202     fRegionLimit       = 0;
203     fAnchorStart       = 0;
204     fAnchorLimit       = 0;
205     fLookStart         = 0;
206     fLookLimit         = 0;
207     fActiveStart       = 0;
208     fActiveLimit       = 0;
209     fTransparentBounds = FALSE;
210     fAnchoringBounds   = TRUE;
211     fMatch             = FALSE;
212     fMatchStart        = 0;
213     fMatchEnd          = 0;
214     fLastMatchEnd      = -1;
215     fAppendPosition    = 0;
216     fHitEnd            = FALSE;
217     fRequireEnd        = FALSE;
218     fStack             = NULL;
219     fFrame             = NULL;
220     fTimeLimit         = 0;
221     fTime              = 0;
222     fTickCounter       = 0;
223     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224     fCallbackFn        = NULL;
225     fCallbackContext   = NULL;
226     fFindProgressCallbackFn      = NULL;
227     fFindProgressCallbackContext = NULL;
228     fTraceDebug        = FALSE;
229     fDeferredStatus    = status;
230     fData              = fSmallData;
231     fWordBreakItr      = NULL;
232 
233     fStack             = NULL;
234     fInputText         = NULL;
235     fAltInputText      = NULL;
236     fInput             = NULL;
237     fInputLength       = 0;
238     fInputUniStrMaybeMutable = FALSE;
239 
240     if (U_FAILURE(status)) {
241         fDeferredStatus = status;
242     }
243 }
244 
245 //
246 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247 //            This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250     if (U_FAILURE(status)) {
251         fDeferredStatus = status;
252         return;
253     }
254 
255     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257         if (fData == NULL) {
258             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259             return;
260         }
261     }
262 
263     fStack = new UVector64(status);
264     if (fStack == NULL) {
265         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266         return;
267     }
268 
269     reset(input);
270     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271     if (U_FAILURE(status)) {
272         fDeferredStatus = status;
273         return;
274     }
275 }
276 
277 
278 static const UChar BACKSLASH  = 0x5c;
279 static const UChar DOLLARSIGN = 0x24;
280 //--------------------------------------------------------------------------------
281 //
282 //    appendReplacement
283 //
284 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)285 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286                                               const UnicodeString &replacement,
287                                               UErrorCode &status) {
288     UText replacementText = UTEXT_INITIALIZER;
289 
290     utext_openConstUnicodeString(&replacementText, &replacement, &status);
291     if (U_SUCCESS(status)) {
292         UText resultText = UTEXT_INITIALIZER;
293         utext_openUnicodeString(&resultText, &dest, &status);
294 
295         if (U_SUCCESS(status)) {
296             appendReplacement(&resultText, &replacementText, status);
297             utext_close(&resultText);
298         }
299         utext_close(&replacementText);
300     }
301 
302     return *this;
303 }
304 
305 //
306 //    appendReplacement, UText mode
307 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)308 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309                                               UText *replacement,
310                                               UErrorCode &status) {
311     if (U_FAILURE(status)) {
312         return *this;
313     }
314     if (U_FAILURE(fDeferredStatus)) {
315         status = fDeferredStatus;
316         return *this;
317     }
318     if (fMatch == FALSE) {
319         status = U_REGEX_INVALID_STATE;
320         return *this;
321     }
322 
323     // Copy input string from the end of previous match to start of current match
324     int64_t  destLen = utext_nativeLength(dest);
325     if (fMatchStart > fAppendPosition) {
326         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328                                      (int32_t)(fMatchStart-fAppendPosition), &status);
329         } else {
330             int32_t len16;
331             if (UTEXT_USES_U16(fInputText)) {
332                 len16 = (int32_t)(fMatchStart-fAppendPosition);
333             } else {
334                 UErrorCode lengthStatus = U_ZERO_ERROR;
335                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336             }
337             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338             if (inputChars == NULL) {
339                 status = U_MEMORY_ALLOCATION_ERROR;
340                 return *this;
341             }
342             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344             uprv_free(inputChars);
345         }
346     }
347     fAppendPosition = fMatchEnd;
348 
349 
350     // scan the replacement text, looking for substitutions ($n) and \escapes.
351     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
352     //         move entire ranges not containing substitutions.
353     UTEXT_SETNATIVEINDEX(replacement, 0);
354     UChar32 c = UTEXT_NEXT32(replacement);
355     while (c != U_SENTINEL) {
356         if (c == BACKSLASH) {
357             // Backslash Escape.  Copy the following char out without further checks.
358             //                    Note:  Surrogate pairs don't need any special handling
359             //                           The second half wont be a '$' or a '\', and
360             //                           will move to the dest normally on the next
361             //                           loop iteration.
362             c = UTEXT_CURRENT32(replacement);
363             if (c == U_SENTINEL) {
364                 break;
365             }
366 
367             if (c==0x55/*U*/ || c==0x75/*u*/) {
368                 // We have a \udddd or \Udddddddd escape sequence.
369                 int32_t offset = 0;
370                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372                 if (escapedChar != (UChar32)0xFFFFFFFF) {
373                     if (U_IS_BMP(escapedChar)) {
374                         UChar c16 = (UChar)escapedChar;
375                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376                     } else {
377                         UChar surrogate[2];
378                         surrogate[0] = U16_LEAD(escapedChar);
379                         surrogate[1] = U16_TRAIL(escapedChar);
380                         if (U_SUCCESS(status)) {
381                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382                         }
383                     }
384                     // TODO:  Report errors for mal-formed \u escapes?
385                     //        As this is, the original sequence is output, which may be OK.
386                     if (context.lastOffset == offset) {
387                         (void)UTEXT_PREVIOUS32(replacement);
388                     } else if (context.lastOffset != offset-1) {
389                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390                     }
391                 }
392             } else {
393                 (void)UTEXT_NEXT32(replacement);
394                 // Plain backslash escape.  Just put out the escaped character.
395                 if (U_IS_BMP(c)) {
396                     UChar c16 = (UChar)c;
397                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398                 } else {
399                     UChar surrogate[2];
400                     surrogate[0] = U16_LEAD(c);
401                     surrogate[1] = U16_TRAIL(c);
402                     if (U_SUCCESS(status)) {
403                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404                     }
405                 }
406             }
407         } else if (c != DOLLARSIGN) {
408             // Normal char, not a $.  Copy it out without further checks.
409             if (U_IS_BMP(c)) {
410                 UChar c16 = (UChar)c;
411                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412             } else {
413                 UChar surrogate[2];
414                 surrogate[0] = U16_LEAD(c);
415                 surrogate[1] = U16_TRAIL(c);
416                 if (U_SUCCESS(status)) {
417                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418                 }
419             }
420         } else {
421             // We've got a $.  Pick up a capture group number if one follows.
422             // Consume at most the number of digits necessary for the largest capture
423             // number that is valid for this pattern.
424 
425             int32_t numDigits = 0;
426             int32_t groupNum  = 0;
427             UChar32 digitC;
428             for (;;) {
429                 digitC = UTEXT_CURRENT32(replacement);
430                 if (digitC == U_SENTINEL) {
431                     break;
432                 }
433                 if (u_isdigit(digitC) == FALSE) {
434                     break;
435                 }
436                 (void)UTEXT_NEXT32(replacement);
437                 groupNum=groupNum*10 + u_charDigitValue(digitC);
438                 numDigits++;
439                 if (numDigits >= fPattern->fMaxCaptureDigits) {
440                     break;
441                 }
442             }
443 
444 
445             if (numDigits == 0) {
446                 // The $ didn't introduce a group number at all.
447                 // Treat it as just part of the substitution text.
448                 UChar c16 = DOLLARSIGN;
449                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450             } else {
451                 // Finally, append the capture group data to the destination.
452                 destLen += appendGroup(groupNum, dest, status);
453                 if (U_FAILURE(status)) {
454                     // Can fail if group number is out of range.
455                     break;
456                 }
457             }
458         }
459 
460         if (U_FAILURE(status)) {
461             break;
462         } else {
463             c = UTEXT_NEXT32(replacement);
464         }
465     }
466 
467     return *this;
468 }
469 
470 
471 
472 //--------------------------------------------------------------------------------
473 //
474 //    appendTail     Intended to be used in conjunction with appendReplacement()
475 //                   To the destination string, append everything following
476 //                   the last match position from the input string.
477 //
478 //                   Note:  Match ranges do not affect appendTail or appendReplacement
479 //
480 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)481 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482     UErrorCode status = U_ZERO_ERROR;
483     UText resultText = UTEXT_INITIALIZER;
484     utext_openUnicodeString(&resultText, &dest, &status);
485 
486     if (U_SUCCESS(status)) {
487         appendTail(&resultText, status);
488         utext_close(&resultText);
489     }
490 
491     return dest;
492 }
493 
494 //
495 //   appendTail, UText mode
496 //
appendTail(UText * dest,UErrorCode & status)497 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498     UBool bailOut = FALSE;
499     if (U_FAILURE(status)) {
500         bailOut = TRUE;
501     }
502     if (U_FAILURE(fDeferredStatus)) {
503         status = fDeferredStatus;
504         bailOut = TRUE;
505     }
506 
507     if (bailOut) {
508         //  dest must not be NULL
509         if (dest) {
510             utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
511             return dest;
512         }
513     }
514 
515     if (fInputLength > fAppendPosition) {
516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517             int64_t destLen = utext_nativeLength(dest);
518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                           (int32_t)(fInputLength-fAppendPosition), &status);
520         } else {
521             int32_t len16;
522             if (UTEXT_USES_U16(fInputText)) {
523                 len16 = (int32_t)(fInputLength-fAppendPosition);
524             } else {
525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526                 status = U_ZERO_ERROR; // buffer overflow
527             }
528 
529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530             if (inputChars == NULL) {
531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532             } else {
533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                 int64_t destLen = utext_nativeLength(dest);
535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                 uprv_free(inputChars);
537             }
538         }
539     }
540     return dest;
541 }
542 
543 
544 
545 //--------------------------------------------------------------------------------
546 //
547 //   end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551     return end(0, err);
552 }
553 
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555     return end64(0, err);
556 }
557 
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559     if (U_FAILURE(err)) {
560         return -1;
561     }
562     if (fMatch == FALSE) {
563         err = U_REGEX_INVALID_STATE;
564         return -1;
565     }
566     if (group < 0 || group > fPattern->fGroupMap->size()) {
567         err = U_INDEX_OUTOFBOUNDS_ERROR;
568         return -1;
569     }
570     int64_t e = -1;
571     if (group == 0) {
572         e = fMatchEnd;
573     } else {
574         // Get the position within the stack frame of the variables for
575         //    this capture group.
576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577         U_ASSERT(groupOffset < fPattern->fFrameSize);
578         U_ASSERT(groupOffset >= 0);
579         e = fFrame->fExtra[groupOffset + 1];
580     }
581 
582         return e;
583 }
584 
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586     return (int32_t)end64(group, err);
587 }
588 
589 
590 //--------------------------------------------------------------------------------
591 //
592 //   find()
593 //
594 //--------------------------------------------------------------------------------
find()595 UBool RegexMatcher::find() {
596     // Start at the position of the last match end.  (Will be zero if the
597     //   matcher has been reset.)
598     //
599     if (U_FAILURE(fDeferredStatus)) {
600         return FALSE;
601     }
602 
603     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
604         return findUsingChunk();
605     }
606 
607     int64_t startPos = fMatchEnd;
608     if (startPos==0) {
609         startPos = fActiveStart;
610     }
611 
612     if (fMatch) {
613         // Save the position of any previous successful match.
614         fLastMatchEnd = fMatchEnd;
615 
616         if (fMatchStart == fMatchEnd) {
617             // Previous match had zero length.  Move start position up one position
618             //  to avoid sending find() into a loop on zero-length matches.
619             if (startPos >= fActiveLimit) {
620                 fMatch = FALSE;
621                 fHitEnd = TRUE;
622                 return FALSE;
623             }
624             UTEXT_SETNATIVEINDEX(fInputText, startPos);
625             (void)UTEXT_NEXT32(fInputText);
626             startPos = UTEXT_GETNATIVEINDEX(fInputText);
627         }
628     } else {
629         if (fLastMatchEnd >= 0) {
630             // A previous find() failed to match.  Don't try again.
631             //   (without this test, a pattern with a zero-length match
632             //    could match again at the end of an input string.)
633             fHitEnd = TRUE;
634             return FALSE;
635         }
636     }
637 
638 
639     // Compute the position in the input string beyond which a match can not begin, because
640     //   the minimum length match would extend past the end of the input.
641     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
642     //          Be aware of possible overflows if making changes here.
643     int64_t testStartLimit;
644     if (UTEXT_USES_U16(fInputText)) {
645         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
646         if (startPos > testStartLimit) {
647             fMatch = FALSE;
648             fHitEnd = TRUE;
649             return FALSE;
650         }
651     } else {
652         // For now, let the matcher discover that it can't match on its own
653         // We don't know how long the match len is in native characters
654         testStartLimit = fActiveLimit;
655     }
656 
657     UChar32  c;
658     U_ASSERT(startPos >= 0);
659 
660     switch (fPattern->fStartType) {
661     case START_NO_INFO:
662         // No optimization was found.
663         //  Try a match at each input position.
664         for (;;) {
665             MatchAt(startPos, FALSE, fDeferredStatus);
666             if (U_FAILURE(fDeferredStatus)) {
667                 return FALSE;
668             }
669             if (fMatch) {
670                 return TRUE;
671             }
672             if (startPos >= testStartLimit) {
673                 fHitEnd = TRUE;
674                 return FALSE;
675             }
676             UTEXT_SETNATIVEINDEX(fInputText, startPos);
677             (void)UTEXT_NEXT32(fInputText);
678             startPos = UTEXT_GETNATIVEINDEX(fInputText);
679             // Note that it's perfectly OK for a pattern to have a zero-length
680             //   match at the end of a string, so we must make sure that the loop
681             //   runs with startPos == testStartLimit the last time through.
682             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
683                 return FALSE;
684         }
685         U_ASSERT(FALSE);
686 
687     case START_START:
688         // Matches are only possible at the start of the input string
689         //   (pattern begins with ^ or \A)
690         if (startPos > fActiveStart) {
691             fMatch = FALSE;
692             return FALSE;
693         }
694         MatchAt(startPos, FALSE, fDeferredStatus);
695         if (U_FAILURE(fDeferredStatus)) {
696             return FALSE;
697         }
698         return fMatch;
699 
700 
701     case START_SET:
702         {
703             // Match may start on any char from a pre-computed set.
704             U_ASSERT(fPattern->fMinMatchLen > 0);
705             int64_t pos;
706             UTEXT_SETNATIVEINDEX(fInputText, startPos);
707             for (;;) {
708                 c = UTEXT_NEXT32(fInputText);
709                 pos = UTEXT_GETNATIVEINDEX(fInputText);
710                 // c will be -1 (U_SENTINEL) at end of text, in which case we
711                 // skip this next block (so we don't have a negative array index)
712                 // and handle end of text in the following block.
713                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
714                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
715                     MatchAt(startPos, FALSE, fDeferredStatus);
716                     if (U_FAILURE(fDeferredStatus)) {
717                         return FALSE;
718                     }
719                     if (fMatch) {
720                         return TRUE;
721                     }
722                     UTEXT_SETNATIVEINDEX(fInputText, pos);
723                 }
724                 if (startPos >= testStartLimit) {
725                     fMatch = FALSE;
726                     fHitEnd = TRUE;
727                     return FALSE;
728                 }
729                 startPos = pos;
730 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
731                     return FALSE;
732             }
733         }
734         U_ASSERT(FALSE);
735 
736     case START_STRING:
737     case START_CHAR:
738         {
739             // Match starts on exactly one char.
740             U_ASSERT(fPattern->fMinMatchLen > 0);
741             UChar32 theChar = fPattern->fInitialChar;
742             int64_t pos;
743             UTEXT_SETNATIVEINDEX(fInputText, startPos);
744             for (;;) {
745                 c = UTEXT_NEXT32(fInputText);
746                 pos = UTEXT_GETNATIVEINDEX(fInputText);
747                 if (c == theChar) {
748                     MatchAt(startPos, FALSE, fDeferredStatus);
749                     if (U_FAILURE(fDeferredStatus)) {
750                         return FALSE;
751                     }
752                     if (fMatch) {
753                         return TRUE;
754                     }
755                     UTEXT_SETNATIVEINDEX(fInputText, pos);
756                 }
757                 if (startPos >= testStartLimit) {
758                     fMatch = FALSE;
759                     fHitEnd = TRUE;
760                     return FALSE;
761                 }
762                 startPos = pos;
763 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
764                     return FALSE;
765            }
766         }
767         U_ASSERT(FALSE);
768 
769     case START_LINE:
770         {
771             UChar32  c;
772             if (startPos == fAnchorStart) {
773                 MatchAt(startPos, FALSE, fDeferredStatus);
774                 if (U_FAILURE(fDeferredStatus)) {
775                     return FALSE;
776                 }
777                 if (fMatch) {
778                     return TRUE;
779                 }
780                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781                 c = UTEXT_NEXT32(fInputText);
782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783             } else {
784                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
785                 c = UTEXT_PREVIOUS32(fInputText);
786                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
787             }
788 
789             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
790                 for (;;) {
791                     if (c == 0x0a) {
792                             MatchAt(startPos, FALSE, fDeferredStatus);
793                             if (U_FAILURE(fDeferredStatus)) {
794                                 return FALSE;
795                             }
796                             if (fMatch) {
797                                 return TRUE;
798                             }
799                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
800                     }
801                     if (startPos >= testStartLimit) {
802                         fMatch = FALSE;
803                         fHitEnd = TRUE;
804                         return FALSE;
805                     }
806                     c = UTEXT_NEXT32(fInputText);
807                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
808                     // Note that it's perfectly OK for a pattern to have a zero-length
809                     //   match at the end of a string, so we must make sure that the loop
810                     //   runs with startPos == testStartLimit the last time through.
811 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
812                         return FALSE;
813                 }
814             } else {
815                 for (;;) {
816                     if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
817                         ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
818                             if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
819                                 (void)UTEXT_NEXT32(fInputText);
820                                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
821                             }
822                             MatchAt(startPos, FALSE, fDeferredStatus);
823                             if (U_FAILURE(fDeferredStatus)) {
824                                 return FALSE;
825                             }
826                             if (fMatch) {
827                                 return TRUE;
828                             }
829                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
830                     }
831                     if (startPos >= testStartLimit) {
832                         fMatch = FALSE;
833                         fHitEnd = TRUE;
834                         return FALSE;
835                     }
836                     c = UTEXT_NEXT32(fInputText);
837                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
838                     // Note that it's perfectly OK for a pattern to have a zero-length
839                     //   match at the end of a string, so we must make sure that the loop
840                     //   runs with startPos == testStartLimit the last time through.
841 		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
842                         return FALSE;
843                 }
844             }
845         }
846 
847     default:
848         U_ASSERT(FALSE);
849     }
850 
851     U_ASSERT(FALSE);
852     return FALSE;
853 }
854 
855 
856 
find(int64_t start,UErrorCode & status)857 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
858     if (U_FAILURE(status)) {
859         return FALSE;
860     }
861     if (U_FAILURE(fDeferredStatus)) {
862         status = fDeferredStatus;
863         return FALSE;
864     }
865     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
866                                           //        This will reset the region to be the full input length.
867     if (start < 0) {
868         status = U_INDEX_OUTOFBOUNDS_ERROR;
869         return FALSE;
870     }
871 
872     int64_t nativeStart = start;
873     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
874         status = U_INDEX_OUTOFBOUNDS_ERROR;
875         return FALSE;
876     }
877     fMatchEnd = nativeStart;
878     return find();
879 }
880 
881 
882 //--------------------------------------------------------------------------------
883 //
884 //   findUsingChunk() -- like find(), but with the advance knowledge that the
885 //                       entire string is available in the UText's chunk buffer.
886 //
887 //--------------------------------------------------------------------------------
findUsingChunk()888 UBool RegexMatcher::findUsingChunk() {
889     // Start at the position of the last match end.  (Will be zero if the
890     //   matcher has been reset.
891     //
892 
893     int32_t startPos = (int32_t)fMatchEnd;
894     if (startPos==0) {
895         startPos = (int32_t)fActiveStart;
896     }
897 
898     const UChar *inputBuf = fInputText->chunkContents;
899 
900     if (fMatch) {
901         // Save the position of any previous successful match.
902         fLastMatchEnd = fMatchEnd;
903 
904         if (fMatchStart == fMatchEnd) {
905             // Previous match had zero length.  Move start position up one position
906             //  to avoid sending find() into a loop on zero-length matches.
907             if (startPos >= fActiveLimit) {
908                 fMatch = FALSE;
909                 fHitEnd = TRUE;
910                 return FALSE;
911             }
912             U16_FWD_1(inputBuf, startPos, fInputLength);
913         }
914     } else {
915         if (fLastMatchEnd >= 0) {
916             // A previous find() failed to match.  Don't try again.
917             //   (without this test, a pattern with a zero-length match
918             //    could match again at the end of an input string.)
919             fHitEnd = TRUE;
920             return FALSE;
921         }
922     }
923 
924 
925     // Compute the position in the input string beyond which a match can not begin, because
926     //   the minimum length match would extend past the end of the input.
927     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
928     //          Be aware of possible overflows if making changes here.
929     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
930     if (startPos > testLen) {
931         fMatch = FALSE;
932         fHitEnd = TRUE;
933         return FALSE;
934     }
935 
936     UChar32  c;
937     U_ASSERT(startPos >= 0);
938 
939     switch (fPattern->fStartType) {
940     case START_NO_INFO:
941         // No optimization was found.
942         //  Try a match at each input position.
943         for (;;) {
944             MatchChunkAt(startPos, FALSE, fDeferredStatus);
945             if (U_FAILURE(fDeferredStatus)) {
946                 return FALSE;
947             }
948             if (fMatch) {
949                 return TRUE;
950             }
951             if (startPos >= testLen) {
952                 fHitEnd = TRUE;
953                 return FALSE;
954             }
955             U16_FWD_1(inputBuf, startPos, fActiveLimit);
956             // Note that it's perfectly OK for a pattern to have a zero-length
957             //   match at the end of a string, so we must make sure that the loop
958             //   runs with startPos == testLen the last time through.
959             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
960                 return FALSE;
961         }
962         U_ASSERT(FALSE);
963 
964     case START_START:
965         // Matches are only possible at the start of the input string
966         //   (pattern begins with ^ or \A)
967         if (startPos > fActiveStart) {
968             fMatch = FALSE;
969             return FALSE;
970         }
971         MatchChunkAt(startPos, FALSE, fDeferredStatus);
972         if (U_FAILURE(fDeferredStatus)) {
973             return FALSE;
974         }
975         return fMatch;
976 
977 
978     case START_SET:
979     {
980         // Match may start on any char from a pre-computed set.
981         U_ASSERT(fPattern->fMinMatchLen > 0);
982         for (;;) {
983             int32_t pos = startPos;
984             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
985             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
986                 (c>=256 && fPattern->fInitialChars->contains(c))) {
987                 MatchChunkAt(pos, FALSE, fDeferredStatus);
988                 if (U_FAILURE(fDeferredStatus)) {
989                     return FALSE;
990                 }
991                 if (fMatch) {
992                     return TRUE;
993                 }
994             }
995             if (pos >= testLen) {
996                 fMatch = FALSE;
997                 fHitEnd = TRUE;
998                 return FALSE;
999             }
1000             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1001                 return FALSE;
1002         }
1003     }
1004         U_ASSERT(FALSE);
1005 
1006     case START_STRING:
1007     case START_CHAR:
1008     {
1009         // Match starts on exactly one char.
1010         U_ASSERT(fPattern->fMinMatchLen > 0);
1011         UChar32 theChar = fPattern->fInitialChar;
1012         for (;;) {
1013             int32_t pos = startPos;
1014             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1015             if (c == theChar) {
1016                 MatchChunkAt(pos, FALSE, fDeferredStatus);
1017                 if (U_FAILURE(fDeferredStatus)) {
1018                     return FALSE;
1019                 }
1020                 if (fMatch) {
1021                     return TRUE;
1022                 }
1023             }
1024             if (pos >= testLen) {
1025                 fMatch = FALSE;
1026                 fHitEnd = TRUE;
1027                 return FALSE;
1028             }
1029             if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1030                 return FALSE;
1031         }
1032     }
1033         U_ASSERT(FALSE);
1034 
1035     case START_LINE:
1036     {
1037         UChar32  c;
1038         if (startPos == fAnchorStart) {
1039             MatchChunkAt(startPos, FALSE, fDeferredStatus);
1040             if (U_FAILURE(fDeferredStatus)) {
1041                 return FALSE;
1042             }
1043             if (fMatch) {
1044                 return TRUE;
1045             }
1046             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1047         }
1048 
1049         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1050             for (;;) {
1051                 c = inputBuf[startPos-1];
1052                 if (c == 0x0a) {
1053                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1054                     if (U_FAILURE(fDeferredStatus)) {
1055                         return FALSE;
1056                     }
1057                     if (fMatch) {
1058                         return TRUE;
1059                     }
1060                 }
1061                 if (startPos >= testLen) {
1062                     fMatch = FALSE;
1063                     fHitEnd = TRUE;
1064                     return FALSE;
1065                 }
1066                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1067                 // Note that it's perfectly OK for a pattern to have a zero-length
1068                 //   match at the end of a string, so we must make sure that the loop
1069                 //   runs with startPos == testLen the last time through.
1070 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1071                     return FALSE;
1072             }
1073         } else {
1074             for (;;) {
1075                 c = inputBuf[startPos-1];
1076                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1077                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1078                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1079                         startPos++;
1080                     }
1081                     MatchChunkAt(startPos, FALSE, fDeferredStatus);
1082                     if (U_FAILURE(fDeferredStatus)) {
1083                         return FALSE;
1084                     }
1085                     if (fMatch) {
1086                         return TRUE;
1087                     }
1088                 }
1089                 if (startPos >= testLen) {
1090                     fMatch = FALSE;
1091                     fHitEnd = TRUE;
1092                     return FALSE;
1093                 }
1094                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1095                 // Note that it's perfectly OK for a pattern to have a zero-length
1096                 //   match at the end of a string, so we must make sure that the loop
1097                 //   runs with startPos == testLen the last time through.
1098 	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1099                     return FALSE;
1100             }
1101         }
1102     }
1103 
1104     default:
1105         U_ASSERT(FALSE);
1106     }
1107 
1108     U_ASSERT(FALSE);
1109     return FALSE;
1110 }
1111 
1112 
1113 
1114 //--------------------------------------------------------------------------------
1115 //
1116 //  group()
1117 //
1118 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1119 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1120     return group(0, status);
1121 }
1122 
1123 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1124 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1125     return group(0, dest, group_len, status);
1126 }
1127 
1128 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1129 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1130     group_len = 0;
1131     UBool bailOut = FALSE;
1132     if (U_FAILURE(status)) {
1133         return dest;
1134     }
1135     if (U_FAILURE(fDeferredStatus)) {
1136         status = fDeferredStatus;
1137         bailOut = TRUE;
1138     }
1139     if (fMatch == FALSE) {
1140         status = U_REGEX_INVALID_STATE;
1141         bailOut = TRUE;
1142     }
1143     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1144         status = U_INDEX_OUTOFBOUNDS_ERROR;
1145         bailOut = TRUE;
1146     }
1147 
1148     if (bailOut) {
1149         return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1150     }
1151 
1152     int64_t s, e;
1153     if (groupNum == 0) {
1154         s = fMatchStart;
1155         e = fMatchEnd;
1156     } else {
1157         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1158         U_ASSERT(groupOffset < fPattern->fFrameSize);
1159         U_ASSERT(groupOffset >= 0);
1160         s = fFrame->fExtra[groupOffset];
1161         e = fFrame->fExtra[groupOffset+1];
1162     }
1163 
1164     if (s < 0) {
1165         // A capture group wasn't part of the match
1166         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1167     }
1168     U_ASSERT(s <= e);
1169     group_len = e - s;
1170 
1171     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1172     if (dest)
1173         UTEXT_SETNATIVEINDEX(dest, s);
1174     return dest;
1175 }
1176 
group(int32_t groupNum,UErrorCode & status) const1177 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1178     UnicodeString result;
1179     if (U_FAILURE(status)) {
1180         return result;
1181     }
1182     UText resultText = UTEXT_INITIALIZER;
1183     utext_openUnicodeString(&resultText, &result, &status);
1184     group(groupNum, &resultText, status);
1185     utext_close(&resultText);
1186     return result;
1187 }
1188 
1189 
1190 //  Return deep (mutable) clone
1191 //		Technology Preview (as an API), but note that the UnicodeString API is implemented
1192 //		using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1193 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1194     UBool bailOut = FALSE;
1195     if (U_FAILURE(status)) {
1196         return dest;
1197     }
1198     if (U_FAILURE(fDeferredStatus)) {
1199         status = fDeferredStatus;
1200         bailOut = TRUE;
1201     }
1202 
1203     if (fMatch == FALSE) {
1204         status = U_REGEX_INVALID_STATE;
1205         bailOut = TRUE;
1206     }
1207     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1208         status = U_INDEX_OUTOFBOUNDS_ERROR;
1209         bailOut = TRUE;
1210     }
1211 
1212     if (bailOut) {
1213         if (dest) {
1214             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1215             return dest;
1216         } else {
1217             return utext_openUChars(NULL, NULL, 0, &status);
1218         }
1219     }
1220 
1221     int64_t s, e;
1222     if (groupNum == 0) {
1223         s = fMatchStart;
1224         e = fMatchEnd;
1225     } else {
1226         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1227         U_ASSERT(groupOffset < fPattern->fFrameSize);
1228         U_ASSERT(groupOffset >= 0);
1229         s = fFrame->fExtra[groupOffset];
1230         e = fFrame->fExtra[groupOffset+1];
1231     }
1232 
1233     if (s < 0) {
1234         // A capture group wasn't part of the match
1235         if (dest) {
1236             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1237             return dest;
1238         } else {
1239             return utext_openUChars(NULL, NULL, 0, &status);
1240         }
1241     }
1242     U_ASSERT(s <= e);
1243 
1244     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1245         U_ASSERT(e <= fInputLength);
1246         if (dest) {
1247             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1248         } else {
1249             UText groupText = UTEXT_INITIALIZER;
1250             utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1251             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1252             utext_close(&groupText);
1253         }
1254     } else {
1255         int32_t len16;
1256         if (UTEXT_USES_U16(fInputText)) {
1257             len16 = (int32_t)(e-s);
1258         } else {
1259             UErrorCode lengthStatus = U_ZERO_ERROR;
1260             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1261         }
1262         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1263         if (groupChars == NULL) {
1264             status = U_MEMORY_ALLOCATION_ERROR;
1265             return dest;
1266         }
1267         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1268 
1269         if (dest) {
1270             utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1271         } else {
1272             UText groupText = UTEXT_INITIALIZER;
1273             utext_openUChars(&groupText, groupChars, len16, &status);
1274             dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1275             utext_close(&groupText);
1276         }
1277 
1278         uprv_free(groupChars);
1279     }
1280     return dest;
1281 }
1282 
1283 //--------------------------------------------------------------------------------
1284 //
1285 //  appendGroup() -- currently internal only, appends a group to a UText rather
1286 //                   than replacing its contents
1287 //
1288 //--------------------------------------------------------------------------------
1289 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1290 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1291     if (U_FAILURE(status)) {
1292         return 0;
1293     }
1294     if (U_FAILURE(fDeferredStatus)) {
1295         status = fDeferredStatus;
1296         return 0;
1297     }
1298     int64_t destLen = utext_nativeLength(dest);
1299 
1300     if (fMatch == FALSE) {
1301         status = U_REGEX_INVALID_STATE;
1302         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1303     }
1304     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1305         status = U_INDEX_OUTOFBOUNDS_ERROR;
1306         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1307     }
1308 
1309     int64_t s, e;
1310     if (groupNum == 0) {
1311         s = fMatchStart;
1312         e = fMatchEnd;
1313     } else {
1314         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1315         U_ASSERT(groupOffset < fPattern->fFrameSize);
1316         U_ASSERT(groupOffset >= 0);
1317         s = fFrame->fExtra[groupOffset];
1318         e = fFrame->fExtra[groupOffset+1];
1319     }
1320 
1321     if (s < 0) {
1322         // A capture group wasn't part of the match
1323         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1324     }
1325     U_ASSERT(s <= e);
1326 
1327     int64_t deltaLen;
1328     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1329         U_ASSERT(e <= fInputLength);
1330         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1331     } else {
1332         int32_t len16;
1333         if (UTEXT_USES_U16(fInputText)) {
1334             len16 = (int32_t)(e-s);
1335         } else {
1336             UErrorCode lengthStatus = U_ZERO_ERROR;
1337             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1338         }
1339         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1340         if (groupChars == NULL) {
1341             status = U_MEMORY_ALLOCATION_ERROR;
1342             return 0;
1343         }
1344         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1345 
1346         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1347         uprv_free(groupChars);
1348     }
1349     return deltaLen;
1350 }
1351 
1352 
1353 
1354 //--------------------------------------------------------------------------------
1355 //
1356 //  groupCount()
1357 //
1358 //--------------------------------------------------------------------------------
groupCount() const1359 int32_t RegexMatcher::groupCount() const {
1360     return fPattern->fGroupMap->size();
1361 }
1362 
1363 
1364 
1365 //--------------------------------------------------------------------------------
1366 //
1367 //  hasAnchoringBounds()
1368 //
1369 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1370 UBool RegexMatcher::hasAnchoringBounds() const {
1371     return fAnchoringBounds;
1372 }
1373 
1374 
1375 //--------------------------------------------------------------------------------
1376 //
1377 //  hasTransparentBounds()
1378 //
1379 //--------------------------------------------------------------------------------
hasTransparentBounds() const1380 UBool RegexMatcher::hasTransparentBounds() const {
1381     return fTransparentBounds;
1382 }
1383 
1384 
1385 
1386 //--------------------------------------------------------------------------------
1387 //
1388 //  hitEnd()
1389 //
1390 //--------------------------------------------------------------------------------
hitEnd() const1391 UBool RegexMatcher::hitEnd() const {
1392     return fHitEnd;
1393 }
1394 
1395 
1396 //--------------------------------------------------------------------------------
1397 //
1398 //  input()
1399 //
1400 //--------------------------------------------------------------------------------
input() const1401 const UnicodeString &RegexMatcher::input() const {
1402     if (!fInput) {
1403         UErrorCode status = U_ZERO_ERROR;
1404         int32_t len16;
1405         if (UTEXT_USES_U16(fInputText)) {
1406             len16 = (int32_t)fInputLength;
1407         } else {
1408             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1409             status = U_ZERO_ERROR; // overflow, length status
1410         }
1411         UnicodeString *result = new UnicodeString(len16, 0, 0);
1412 
1413         UChar *inputChars = result->getBuffer(len16);
1414         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1415         result->releaseBuffer(len16);
1416 
1417         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1418     }
1419 
1420     return *fInput;
1421 }
1422 
1423 //--------------------------------------------------------------------------------
1424 //
1425 //  inputText()
1426 //
1427 //--------------------------------------------------------------------------------
inputText() const1428 UText *RegexMatcher::inputText() const {
1429     return fInputText;
1430 }
1431 
1432 
1433 //--------------------------------------------------------------------------------
1434 //
1435 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1436 //
1437 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1438 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1439     UBool bailOut = FALSE;
1440     if (U_FAILURE(status)) {
1441         return dest;
1442     }
1443     if (U_FAILURE(fDeferredStatus)) {
1444         status = fDeferredStatus;
1445         bailOut = TRUE;
1446     }
1447 
1448     if (bailOut) {
1449         if (dest) {
1450             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1451             return dest;
1452         } else {
1453             return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1454         }
1455     }
1456 
1457     if (dest) {
1458         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1459             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1460         } else {
1461             int32_t input16Len;
1462             if (UTEXT_USES_U16(fInputText)) {
1463                 input16Len = (int32_t)fInputLength;
1464             } else {
1465                 UErrorCode lengthStatus = U_ZERO_ERROR;
1466                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1467             }
1468             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1469             if (inputChars == NULL) {
1470                 return dest;
1471             }
1472 
1473             status = U_ZERO_ERROR;
1474             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1475             status = U_ZERO_ERROR;
1476             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1477 
1478             uprv_free(inputChars);
1479         }
1480         return dest;
1481     } else {
1482         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1483     }
1484 }
1485 
1486 
1487 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1488 static UBool compat_SyncMutableUTextContents(UText *ut) {
1489     UBool retVal = FALSE;
1490 
1491     //  In the following test, we're really only interested in whether the UText should switch
1492     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1493     //  will still point to the correct data.
1494     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1495         UnicodeString *us=(UnicodeString *)ut->context;
1496 
1497         // Update to the latest length.
1498         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1499         int32_t newLength = us->length();
1500 
1501         // Update the chunk description.
1502         // The buffer may have switched between stack- and heap-based.
1503         ut->chunkContents    = us->getBuffer();
1504         ut->chunkLength      = newLength;
1505         ut->chunkNativeLimit = newLength;
1506         ut->nativeIndexingLimit = newLength;
1507         retVal = TRUE;
1508     }
1509 
1510     return retVal;
1511 }
1512 
1513 //--------------------------------------------------------------------------------
1514 //
1515 //  lookingAt()
1516 //
1517 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1518 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1519     if (U_FAILURE(status)) {
1520         return FALSE;
1521     }
1522     if (U_FAILURE(fDeferredStatus)) {
1523         status = fDeferredStatus;
1524         return FALSE;
1525     }
1526 
1527     if (fInputUniStrMaybeMutable) {
1528         if (compat_SyncMutableUTextContents(fInputText)) {
1529         fInputLength = utext_nativeLength(fInputText);
1530         reset();
1531         }
1532     }
1533     else {
1534         resetPreserveRegion();
1535     }
1536     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1537         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1538     } else {
1539         MatchAt(fActiveStart, FALSE, status);
1540     }
1541     return fMatch;
1542 }
1543 
1544 
lookingAt(int64_t start,UErrorCode & status)1545 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1546     if (U_FAILURE(status)) {
1547         return FALSE;
1548     }
1549     if (U_FAILURE(fDeferredStatus)) {
1550         status = fDeferredStatus;
1551         return FALSE;
1552     }
1553     reset();
1554 
1555     if (start < 0) {
1556         status = U_INDEX_OUTOFBOUNDS_ERROR;
1557         return FALSE;
1558     }
1559 
1560     if (fInputUniStrMaybeMutable) {
1561         if (compat_SyncMutableUTextContents(fInputText)) {
1562         fInputLength = utext_nativeLength(fInputText);
1563         reset();
1564         }
1565     }
1566 
1567     int64_t nativeStart;
1568     nativeStart = start;
1569     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1570         status = U_INDEX_OUTOFBOUNDS_ERROR;
1571         return FALSE;
1572     }
1573 
1574     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1575         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1576     } else {
1577         MatchAt(nativeStart, FALSE, status);
1578     }
1579     return fMatch;
1580 }
1581 
1582 
1583 
1584 //--------------------------------------------------------------------------------
1585 //
1586 //  matches()
1587 //
1588 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1589 UBool RegexMatcher::matches(UErrorCode &status) {
1590     if (U_FAILURE(status)) {
1591         return FALSE;
1592     }
1593     if (U_FAILURE(fDeferredStatus)) {
1594         status = fDeferredStatus;
1595         return FALSE;
1596     }
1597 
1598     if (fInputUniStrMaybeMutable) {
1599         if (compat_SyncMutableUTextContents(fInputText)) {
1600         fInputLength = utext_nativeLength(fInputText);
1601         reset();
1602         }
1603     }
1604     else {
1605         resetPreserveRegion();
1606     }
1607 
1608     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1609         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1610     } else {
1611         MatchAt(fActiveStart, TRUE, status);
1612     }
1613     return fMatch;
1614 }
1615 
1616 
matches(int64_t start,UErrorCode & status)1617 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1618     if (U_FAILURE(status)) {
1619         return FALSE;
1620     }
1621     if (U_FAILURE(fDeferredStatus)) {
1622         status = fDeferredStatus;
1623         return FALSE;
1624     }
1625     reset();
1626 
1627     if (start < 0) {
1628         status = U_INDEX_OUTOFBOUNDS_ERROR;
1629         return FALSE;
1630     }
1631 
1632     if (fInputUniStrMaybeMutable) {
1633         if (compat_SyncMutableUTextContents(fInputText)) {
1634         fInputLength = utext_nativeLength(fInputText);
1635         reset();
1636         }
1637     }
1638 
1639     int64_t nativeStart;
1640     nativeStart = start;
1641     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1642         status = U_INDEX_OUTOFBOUNDS_ERROR;
1643         return FALSE;
1644     }
1645 
1646     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1647         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1648     } else {
1649         MatchAt(nativeStart, TRUE, status);
1650     }
1651     return fMatch;
1652 }
1653 
1654 
1655 
1656 //--------------------------------------------------------------------------------
1657 //
1658 //    pattern
1659 //
1660 //--------------------------------------------------------------------------------
pattern() const1661 const RegexPattern &RegexMatcher::pattern() const {
1662     return *fPattern;
1663 }
1664 
1665 
1666 
1667 //--------------------------------------------------------------------------------
1668 //
1669 //    region
1670 //
1671 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1672 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1673     if (U_FAILURE(status)) {
1674         return *this;
1675     }
1676 
1677     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1678         status = U_ILLEGAL_ARGUMENT_ERROR;
1679     }
1680 
1681     int64_t nativeStart = regionStart;
1682     int64_t nativeLimit = regionLimit;
1683     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1684       status = U_ILLEGAL_ARGUMENT_ERROR;
1685     }
1686 
1687     if (startIndex == -1)
1688       this->reset();
1689     else
1690       resetPreserveRegion();
1691 
1692     fRegionStart = nativeStart;
1693     fRegionLimit = nativeLimit;
1694     fActiveStart = nativeStart;
1695     fActiveLimit = nativeLimit;
1696 
1697     if (startIndex != -1) {
1698       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1699           status = U_INDEX_OUTOFBOUNDS_ERROR;
1700       }
1701       fMatchEnd = startIndex;
1702     }
1703 
1704     if (!fTransparentBounds) {
1705         fLookStart = nativeStart;
1706         fLookLimit = nativeLimit;
1707     }
1708     if (fAnchoringBounds) {
1709         fAnchorStart = nativeStart;
1710         fAnchorLimit = nativeLimit;
1711     }
1712     return *this;
1713 }
1714 
region(int64_t start,int64_t limit,UErrorCode & status)1715 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1716   return region(start, limit, -1, status);
1717 }
1718 
1719 //--------------------------------------------------------------------------------
1720 //
1721 //    regionEnd
1722 //
1723 //--------------------------------------------------------------------------------
regionEnd() const1724 int32_t RegexMatcher::regionEnd() const {
1725     return (int32_t)fRegionLimit;
1726 }
1727 
regionEnd64() const1728 int64_t RegexMatcher::regionEnd64() const {
1729     return fRegionLimit;
1730 }
1731 
1732 //--------------------------------------------------------------------------------
1733 //
1734 //    regionStart
1735 //
1736 //--------------------------------------------------------------------------------
regionStart() const1737 int32_t RegexMatcher::regionStart() const {
1738     return (int32_t)fRegionStart;
1739 }
1740 
regionStart64() const1741 int64_t RegexMatcher::regionStart64() const {
1742     return fRegionStart;
1743 }
1744 
1745 
1746 //--------------------------------------------------------------------------------
1747 //
1748 //    replaceAll
1749 //
1750 //--------------------------------------------------------------------------------
// UnicodeString flavor of replaceAll: wraps the replacement and a fresh result
// string in stack UTexts, delegates to the UText overload, and returns the
// result string.  Returns an empty string if status is already a failure.
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
    UnicodeString output;
    if (U_FAILURE(status)) {
        return output;
    }

    UText replacementWrapper = UTEXT_INITIALIZER;
    UText outputWrapper      = UTEXT_INITIALIZER;
    utext_openConstUnicodeString(&replacementWrapper, &replacement, &status);
    utext_openUnicodeString(&outputWrapper, &output, &status);

    replaceAll(&replacementWrapper, &outputWrapper, status);

    utext_close(&outputWrapper);
    utext_close(&replacementWrapper);

    return output;
}
1769 
1770 
1771 //
1772 //    replaceAll, UText mode
1773 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1774 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1775     if (U_FAILURE(status)) {
1776         return dest;
1777     }
1778     if (U_FAILURE(fDeferredStatus)) {
1779         status = fDeferredStatus;
1780         return dest;
1781     }
1782 
1783     if (dest == NULL) {
1784         UnicodeString emptyString;
1785         UText empty = UTEXT_INITIALIZER;
1786 
1787         utext_openUnicodeString(&empty, &emptyString, &status);
1788         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1789         utext_close(&empty);
1790     }
1791 
1792     if (U_SUCCESS(status)) {
1793         reset();
1794         while (find()) {
1795             appendReplacement(dest, replacement, status);
1796             if (U_FAILURE(status)) {
1797                 break;
1798             }
1799         }
1800         appendTail(dest, status);
1801     }
1802 
1803     return dest;
1804 }
1805 
1806 
1807 //--------------------------------------------------------------------------------
1808 //
1809 //    replaceFirst
1810 //
1811 //--------------------------------------------------------------------------------
// UnicodeString flavor of replaceFirst: wraps the replacement and a fresh
// result string in stack UTexts, delegates to the UText overload, and returns
// the result string.
//   Returns an empty string if status is already a failure on entry; the
//   explicit guard mirrors replaceAll(const UnicodeString &, UErrorCode &).
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
    UText replacementText = UTEXT_INITIALIZER;
    UText resultText = UTEXT_INITIALIZER;
    UnicodeString resultString;
    if (U_FAILURE(status)) {
        return resultString;
    }

    utext_openConstUnicodeString(&replacementText, &replacement, &status);
    utext_openUnicodeString(&resultText, &resultString, &status);

    replaceFirst(&replacementText, &resultText, status);

    utext_close(&resultText);
    utext_close(&replacementText);

    return resultString;
}
1827 
1828 //
1829 //    replaceFirst, UText mode
1830 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1831 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1832     if (U_FAILURE(status)) {
1833         return dest;
1834     }
1835     if (U_FAILURE(fDeferredStatus)) {
1836         status = fDeferredStatus;
1837         return dest;
1838     }
1839 
1840     reset();
1841     if (!find()) {
1842         return getInput(dest, status);
1843     }
1844 
1845     if (dest == NULL) {
1846         UnicodeString emptyString;
1847         UText empty = UTEXT_INITIALIZER;
1848 
1849         utext_openUnicodeString(&empty, &emptyString, &status);
1850         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1851         utext_close(&empty);
1852     }
1853 
1854     appendReplacement(dest, replacement, status);
1855     appendTail(dest, status);
1856 
1857     return dest;
1858 }
1859 
1860 
1861 //--------------------------------------------------------------------------------
1862 //
1863 //     requireEnd
1864 //
1865 //--------------------------------------------------------------------------------
requireEnd() const1866 UBool RegexMatcher::requireEnd() const {
1867     return fRequireEnd;
1868 }
1869 
1870 
1871 //--------------------------------------------------------------------------------
1872 //
1873 //     reset
1874 //
1875 //--------------------------------------------------------------------------------
reset()1876 RegexMatcher &RegexMatcher::reset() {
1877     fRegionStart    = 0;
1878     fRegionLimit    = fInputLength;
1879     fActiveStart    = 0;
1880     fActiveLimit    = fInputLength;
1881     fAnchorStart    = 0;
1882     fAnchorLimit    = fInputLength;
1883     fLookStart      = 0;
1884     fLookLimit      = fInputLength;
1885     resetPreserveRegion();
1886     return *this;
1887 }
1888 
1889 
1890 
resetPreserveRegion()1891 void RegexMatcher::resetPreserveRegion() {
1892     fMatchStart     = 0;
1893     fMatchEnd       = 0;
1894     fLastMatchEnd   = -1;
1895     fAppendPosition = 0;
1896     fMatch          = FALSE;
1897     fHitEnd         = FALSE;
1898     fRequireEnd     = FALSE;
1899     fTime           = 0;
1900     fTickCounter    = TIMER_INITIAL_VALUE;
1901     //resetStack(); // more expensive than it looks...
1902 }
1903 
1904 
reset(const UnicodeString & input)1905 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1906     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1907     if (fPattern->fNeedsAltInput) {
1908         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1909     }
1910     fInputLength = utext_nativeLength(fInputText);
1911 
1912     reset();
1913     delete fInput;
1914     fInput = NULL;
1915 
1916     //  Do the following for any UnicodeString.
1917     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1918     fInputUniStrMaybeMutable = TRUE;
1919 
1920     if (fWordBreakItr != NULL) {
1921 #if UCONFIG_NO_BREAK_ITERATION==0
1922         UErrorCode status = U_ZERO_ERROR;
1923         fWordBreakItr->setText(fInputText, status);
1924 #endif
1925     }
1926     return *this;
1927 }
1928 
1929 
reset(UText * input)1930 RegexMatcher &RegexMatcher::reset(UText *input) {
1931     if (fInputText != input) {
1932         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1933         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1934         fInputLength = utext_nativeLength(fInputText);
1935 
1936         delete fInput;
1937         fInput = NULL;
1938 
1939         if (fWordBreakItr != NULL) {
1940 #if UCONFIG_NO_BREAK_ITERATION==0
1941             UErrorCode status = U_ZERO_ERROR;
1942             fWordBreakItr->setText(input, status);
1943 #endif
1944         }
1945     }
1946     reset();
1947     fInputUniStrMaybeMutable = FALSE;
1948 
1949     return *this;
1950 }
1951 
1952 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1953     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1954     return *this;
1955 }*/
1956 
reset(int64_t position,UErrorCode & status)1957 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1958     if (U_FAILURE(status)) {
1959         return *this;
1960     }
1961     reset();       // Reset also resets the region to be the entire string.
1962 
1963     if (position < 0 || position > fActiveLimit) {
1964         status = U_INDEX_OUTOFBOUNDS_ERROR;
1965         return *this;
1966     }
1967     fMatchEnd = position;
1968     return *this;
1969 }
1970 
1971 
1972 //--------------------------------------------------------------------------------
1973 //
1974 //    refresh
1975 //
1976 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1977 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1978     if (U_FAILURE(status)) {
1979         return *this;
1980     }
1981     if (input == NULL) {
1982         status = U_ILLEGAL_ARGUMENT_ERROR;
1983         return *this;
1984     }
1985     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1986         status = U_ILLEGAL_ARGUMENT_ERROR;
1987         return *this;
1988     }
1989     int64_t  pos = utext_getNativeIndex(fInputText);
1990     //  Shallow read-only clone of the new UText into the existing input UText
1991     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1992     if (U_FAILURE(status)) {
1993         return *this;
1994     }
1995     utext_setNativeIndex(fInputText, pos);
1996 
1997     if (fAltInputText != NULL) {
1998         pos = utext_getNativeIndex(fAltInputText);
1999         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
2000         if (U_FAILURE(status)) {
2001             return *this;
2002         }
2003         utext_setNativeIndex(fAltInputText, pos);
2004     }
2005     return *this;
2006 }
2007 
2008 
2009 
2010 //--------------------------------------------------------------------------------
2011 //
2012 //    setTrace
2013 //
2014 //--------------------------------------------------------------------------------
setTrace(UBool state)2015 void RegexMatcher::setTrace(UBool state) {
2016     fTraceDebug = state;
2017 }
2018 
2019 
2020 
2021 //---------------------------------------------------------------------
2022 //
2023 //   split
2024 //
2025 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2026 int32_t  RegexMatcher::split(const UnicodeString &input,
2027         UnicodeString    dest[],
2028         int32_t          destCapacity,
2029         UErrorCode      &status)
2030 {
2031     UText inputText = UTEXT_INITIALIZER;
2032     utext_openConstUnicodeString(&inputText, &input, &status);
2033     if (U_FAILURE(status)) {
2034         return 0;
2035     }
2036 
2037     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2038     if (destText == NULL) {
2039         status = U_MEMORY_ALLOCATION_ERROR;
2040         return 0;
2041     }
2042     int32_t i;
2043     for (i = 0; i < destCapacity; i++) {
2044         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2045     }
2046 
2047     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2048 
2049     for (i = 0; i < destCapacity; i++) {
2050         utext_close(destText[i]);
2051     }
2052 
2053     uprv_free(destText);
2054     utext_close(&inputText);
2055     return fieldCount;
2056 }
2057 
2058 //
2059 //   split, UText mode
2060 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2061 int32_t  RegexMatcher::split(UText *input,
2062         UText           *dest[],
2063         int32_t          destCapacity,
2064         UErrorCode      &status)
2065 {
2066     //
2067     // Check arguements for validity
2068     //
2069     if (U_FAILURE(status)) {
2070         return 0;
2071     };
2072 
2073     if (destCapacity < 1) {
2074         status = U_ILLEGAL_ARGUMENT_ERROR;
2075         return 0;
2076     }
2077 
2078     //
2079     // Reset for the input text
2080     //
2081     reset(input);
2082     int64_t   nextOutputStringStart = 0;
2083     if (fActiveLimit == 0) {
2084         return 0;
2085     }
2086 
2087     //
2088     // Loop through the input text, searching for the delimiter pattern
2089     //
2090     int32_t i;
2091     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2092     for (i=0; ; i++) {
2093         if (i>=destCapacity-1) {
2094             // There is one or zero output string left.
2095             // Fill the last output string with whatever is left from the input, then exit the loop.
2096             //  ( i will be == destCapacity if we filled the output array while processing
2097             //    capture groups of the delimiter expression, in which case we will discard the
2098             //    last capture group saved in favor of the unprocessed remainder of the
2099             //    input string.)
2100             i = destCapacity-1;
2101             if (fActiveLimit > nextOutputStringStart) {
2102                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2103                     if (dest[i]) {
2104                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2105                                       input->chunkContents+nextOutputStringStart,
2106                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2107                     } else {
2108                         UText remainingText = UTEXT_INITIALIZER;
2109                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2110                                          fActiveLimit-nextOutputStringStart, &status);
2111                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2112                         utext_close(&remainingText);
2113                     }
2114                 } else {
2115                     UErrorCode lengthStatus = U_ZERO_ERROR;
2116                     int32_t remaining16Length =
2117                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2118                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2119                     if (remainingChars == NULL) {
2120                         status = U_MEMORY_ALLOCATION_ERROR;
2121                         break;
2122                     }
2123 
2124                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2125                     if (dest[i]) {
2126                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2127                     } else {
2128                         UText remainingText = UTEXT_INITIALIZER;
2129                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2130                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2131                         utext_close(&remainingText);
2132                     }
2133 
2134                     uprv_free(remainingChars);
2135                 }
2136             }
2137             break;
2138         }
2139         if (find()) {
2140             // We found another delimiter.  Move everything from where we started looking
2141             //  up until the start of the delimiter into the next output string.
2142             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2143                 if (dest[i]) {
2144                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2145                                   input->chunkContents+nextOutputStringStart,
2146                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2147                 } else {
2148                     UText remainingText = UTEXT_INITIALIZER;
2149                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2150                                       fMatchStart-nextOutputStringStart, &status);
2151                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2152                     utext_close(&remainingText);
2153                 }
2154             } else {
2155                 UErrorCode lengthStatus = U_ZERO_ERROR;
2156                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2157                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2158                 if (remainingChars == NULL) {
2159                     status = U_MEMORY_ALLOCATION_ERROR;
2160                     break;
2161                 }
2162                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2163                 if (dest[i]) {
2164                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2165                 } else {
2166                     UText remainingText = UTEXT_INITIALIZER;
2167                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2168                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2169                     utext_close(&remainingText);
2170                 }
2171 
2172                 uprv_free(remainingChars);
2173             }
2174             nextOutputStringStart = fMatchEnd;
2175 
2176             // If the delimiter pattern has capturing parentheses, the captured
2177             //  text goes out into the next n destination strings.
2178             int32_t groupNum;
2179             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2180                 if (i >= destCapacity-2) {
2181                     // Never fill the last available output string with capture group text.
2182                     // It will filled with the last field, the remainder of the
2183                     //  unsplit input text.
2184                     break;
2185                 }
2186                 i++;
2187                 dest[i] = group(groupNum, dest[i], status);
2188             }
2189 
2190             if (nextOutputStringStart == fActiveLimit) {
2191                 // The delimiter was at the end of the string.  We're done, but first
2192                 // we output one last empty string, for the empty field following
2193                 //   the delimiter at the end of input.
2194                 if (i+1 < destCapacity) {
2195                     ++i;
2196                     if (dest[i] == NULL) {
2197                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198                     } else {
2199                         static UChar emptyString[] = {(UChar)0};
2200                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201                     }
2202                 }
2203                 break;
2204 
2205             }
2206         }
2207         else
2208         {
2209             // We ran off the end of the input while looking for the next delimiter.
2210             // All the remaining text goes into the current output string.
2211             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212                 if (dest[i]) {
2213                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214                                   input->chunkContents+nextOutputStringStart,
2215                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216                 } else {
2217                     UText remainingText = UTEXT_INITIALIZER;
2218                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219                                      fActiveLimit-nextOutputStringStart, &status);
2220                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221                     utext_close(&remainingText);
2222                 }
2223             } else {
2224                 UErrorCode lengthStatus = U_ZERO_ERROR;
2225                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227                 if (remainingChars == NULL) {
2228                     status = U_MEMORY_ALLOCATION_ERROR;
2229                     break;
2230                 }
2231 
2232                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233                 if (dest[i]) {
2234                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235                 } else {
2236                     UText remainingText = UTEXT_INITIALIZER;
2237                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239                     utext_close(&remainingText);
2240                 }
2241 
2242                 uprv_free(remainingChars);
2243             }
2244             break;
2245         }
2246         if (U_FAILURE(status)) {
2247             break;
2248         }
2249     }   // end of for loop
2250     return i+1;
2251 }
2252 
2253 
2254 //--------------------------------------------------------------------------------
2255 //
2256 //     start
2257 //
2258 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2259 int32_t RegexMatcher::start(UErrorCode &status) const {
2260     return start(0, status);
2261 }
2262 
start64(UErrorCode & status) const2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
2264     return start64(0, status);
2265 }
2266 
2267 //--------------------------------------------------------------------------------
2268 //
2269 //     start(int32_t group, UErrorCode &status)
2270 //
2271 //--------------------------------------------------------------------------------
2272 
start64(int32_t group,UErrorCode & status) const2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274     if (U_FAILURE(status)) {
2275         return -1;
2276     }
2277     if (U_FAILURE(fDeferredStatus)) {
2278         status = fDeferredStatus;
2279         return -1;
2280     }
2281     if (fMatch == FALSE) {
2282         status = U_REGEX_INVALID_STATE;
2283         return -1;
2284     }
2285     if (group < 0 || group > fPattern->fGroupMap->size()) {
2286         status = U_INDEX_OUTOFBOUNDS_ERROR;
2287         return -1;
2288     }
2289     int64_t s;
2290     if (group == 0) {
2291         s = fMatchStart;
2292     } else {
2293         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294         U_ASSERT(groupOffset < fPattern->fFrameSize);
2295         U_ASSERT(groupOffset >= 0);
2296         s = fFrame->fExtra[groupOffset];
2297     }
2298 
2299     return s;
2300 }
2301 
2302 
start(int32_t group,UErrorCode & status) const2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304     return (int32_t)start64(group, status);
2305 }
2306 
2307 //--------------------------------------------------------------------------------
2308 //
2309 //     useAnchoringBounds
2310 //
2311 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313     fAnchoringBounds = b;
2314     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316     return *this;
2317 }
2318 
2319 
2320 //--------------------------------------------------------------------------------
2321 //
2322 //     useTransparentBounds
2323 //
2324 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326     fTransparentBounds = b;
2327     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329     return *this;
2330 }
2331 
2332 //--------------------------------------------------------------------------------
2333 //
2334 //     setTimeLimit
2335 //
2336 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338     if (U_FAILURE(status)) {
2339         return;
2340     }
2341     if (U_FAILURE(fDeferredStatus)) {
2342         status = fDeferredStatus;
2343         return;
2344     }
2345     if (limit < 0) {
2346         status = U_ILLEGAL_ARGUMENT_ERROR;
2347         return;
2348     }
2349     fTimeLimit = limit;
2350 }
2351 
2352 
2353 //--------------------------------------------------------------------------------
2354 //
2355 //     getTimeLimit
2356 //
2357 //--------------------------------------------------------------------------------
getTimeLimit() const2358 int32_t RegexMatcher::getTimeLimit() const {
2359     return fTimeLimit;
2360 }
2361 
2362 
2363 //--------------------------------------------------------------------------------
2364 //
2365 //     setStackLimit
2366 //
2367 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369     if (U_FAILURE(status)) {
2370         return;
2371     }
2372     if (U_FAILURE(fDeferredStatus)) {
2373         status = fDeferredStatus;
2374         return;
2375     }
2376     if (limit < 0) {
2377         status = U_ILLEGAL_ARGUMENT_ERROR;
2378         return;
2379     }
2380 
2381     // Reset the matcher.  This is needed here in case there is a current match
2382     //    whose final stack frame (containing the match results, pointed to by fFrame)
2383     //    would be lost by resizing to a smaller stack size.
2384     reset();
2385 
2386     if (limit == 0) {
2387         // Unlimited stack expansion
2388         fStack->setMaxCapacity(0);
2389     } else {
2390         // Change the units of the limit  from bytes to ints, and bump the size up
2391         //   to be big enough to hold at least one stack frame for the pattern,
2392         //   if it isn't there already.
2393         int32_t adjustedLimit = limit / sizeof(int32_t);
2394         if (adjustedLimit < fPattern->fFrameSize) {
2395             adjustedLimit = fPattern->fFrameSize;
2396         }
2397         fStack->setMaxCapacity(adjustedLimit);
2398     }
2399     fStackLimit = limit;
2400 }
2401 
2402 
2403 //--------------------------------------------------------------------------------
2404 //
2405 //     getStackLimit
2406 //
2407 //--------------------------------------------------------------------------------
getStackLimit() const2408 int32_t RegexMatcher::getStackLimit() const {
2409     return fStackLimit;
2410 }
2411 
2412 
2413 //--------------------------------------------------------------------------------
2414 //
2415 //     setMatchCallback
2416 //
2417 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2418 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2419                                     const void              *context,
2420                                     UErrorCode              &status) {
2421     if (U_FAILURE(status)) {
2422         return;
2423     }
2424     fCallbackFn = callback;
2425     fCallbackContext = context;
2426 }
2427 
2428 
2429 //--------------------------------------------------------------------------------
2430 //
2431 //     getMatchCallback
2432 //
2433 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2434 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2435                                   const void              *&context,
2436                                   UErrorCode              &status) {
2437     if (U_FAILURE(status)) {
2438        return;
2439     }
2440     callback = fCallbackFn;
2441     context  = fCallbackContext;
2442 }
2443 
2444 
2445 //--------------------------------------------------------------------------------
2446 //
2447 //     setFindProgressCallback
2448 //
2449 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2451                                                 const void                      *context,
2452                                                 UErrorCode                      &status) {
2453     if (U_FAILURE(status)) {
2454         return;
2455     }
2456     fFindProgressCallbackFn = callback;
2457     fFindProgressCallbackContext = context;
2458 }
2459 
2460 
2461 //--------------------------------------------------------------------------------
2462 //
2463 //     getFindProgressCallback
2464 //
2465 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2467                                                 const void                    *&context,
2468                                                 UErrorCode                    &status) {
2469     if (U_FAILURE(status)) {
2470        return;
2471     }
2472     callback = fFindProgressCallbackFn;
2473     context  = fFindProgressCallbackContext;
2474 }
2475 
2476 
2477 //================================================================================
2478 //
2479 //    Code following this point in this file is the internal
2480 //    Match Engine Implementation.
2481 //
2482 //================================================================================
2483 
2484 
2485 //--------------------------------------------------------------------------------
2486 //
2487 //   resetStack
2488 //           Discard any previous contents of the state save stack, and initialize a
2489 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2490 //           where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
resetStack()2492 REStackFrame *RegexMatcher::resetStack() {
2493     // Discard any previous contents of the state save stack, and initialize a
2494     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2495     //  where they indicate that a group has not yet matched anything.
2496     fStack->removeAllElements();
2497 
2498     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499     int32_t i;
2500     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2501         iFrame->fExtra[i] = -1;
2502     }
2503     return iFrame;
2504 }
2505 
2506 
2507 
2508 //--------------------------------------------------------------------------------
2509 //
2510 //   isWordBoundary
2511 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2512 //                     For us,
2513 //                       If the current char is a combining mark,
2514 //                          \b is FALSE.
2515 //                       Else Scan backwards to the first non-combining char.
2516 //                            We are at a boundary if this char and the original char are
2517 //                               opposite in membership in the \w set
2518 //
2519 //          parameters:   pos   - the current position in the input buffer
2520 //
2521 //              TODO:  double-check edge cases at region boundaries.
2522 //
2523 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2524 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2525     UBool isBoundary = FALSE;
2526     UBool cIsWord    = FALSE;
2527 
2528     if (pos >= fLookLimit) {
2529         fHitEnd = TRUE;
2530     } else {
2531         // Determine whether char c at current position is a member of the word set of chars.
2532         // If we're off the end of the string, behave as though we're not at a word char.
2533         UTEXT_SETNATIVEINDEX(fInputText, pos);
2534         UChar32  c = UTEXT_CURRENT32(fInputText);
2535         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2536             // Current char is a combining one.  Not a boundary.
2537             return FALSE;
2538         }
2539         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2540     }
2541 
2542     // Back up until we come to a non-combining char, determine whether
2543     //  that char is a word char.
2544     UBool prevCIsWord = FALSE;
2545     for (;;) {
2546         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2547             break;
2548         }
2549         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2550         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2551               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2552             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2553             break;
2554         }
2555     }
2556     isBoundary = cIsWord ^ prevCIsWord;
2557     return isBoundary;
2558 }
2559 
isChunkWordBoundary(int32_t pos)2560 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2561     UBool isBoundary = FALSE;
2562     UBool cIsWord    = FALSE;
2563 
2564     const UChar *inputBuf = fInputText->chunkContents;
2565 
2566     if (pos >= fLookLimit) {
2567         fHitEnd = TRUE;
2568     } else {
2569         // Determine whether char c at current position is a member of the word set of chars.
2570         // If we're off the end of the string, behave as though we're not at a word char.
2571         UChar32 c;
2572         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2573         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2574             // Current char is a combining one.  Not a boundary.
2575             return FALSE;
2576         }
2577         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2578     }
2579 
2580     // Back up until we come to a non-combining char, determine whether
2581     //  that char is a word char.
2582     UBool prevCIsWord = FALSE;
2583     for (;;) {
2584         if (pos <= fLookStart) {
2585             break;
2586         }
2587         UChar32 prevChar;
2588         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2589         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2590               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2591             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2592             break;
2593         }
2594     }
2595     isBoundary = cIsWord ^ prevCIsWord;
2596     return isBoundary;
2597 }
2598 
2599 //--------------------------------------------------------------------------------
2600 //
2601 //   isUWordBoundary
2602 //
2603 //         Test for a word boundary using RBBI word break.
2604 //
2605 //          parameters:   pos   - the current position in the input buffer
2606 //
2607 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2608 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2609     UBool       returnVal = FALSE;
2610 #if UCONFIG_NO_BREAK_ITERATION==0
2611 
2612     // If we haven't yet created a break iterator for this matcher, do it now.
2613     if (fWordBreakItr == NULL) {
2614         fWordBreakItr =
2615             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2616         if (U_FAILURE(fDeferredStatus)) {
2617             return FALSE;
2618         }
2619         fWordBreakItr->setText(fInputText, fDeferredStatus);
2620     }
2621 
2622     if (pos >= fLookLimit) {
2623         fHitEnd = TRUE;
2624         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2625                             //    words are not boundaries.  All non-word chars stand by themselves,
2626                             //    with word boundaries on both sides.
2627     } else {
2628         if (!UTEXT_USES_U16(fInputText)) {
2629             // !!!: Would like a better way to do this!
2630             UErrorCode status = U_ZERO_ERROR;
2631             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2632         }
2633         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2634     }
2635 #endif
2636     return   returnVal;
2637 }
2638 
2639 //--------------------------------------------------------------------------------
2640 //
2641 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2642 //                     saves. Increment the "time" counter, and call the
2643 //                     user callback function if there is one installed.
2644 //
2645 //                     If the match operation needs to be aborted, either for a time-out
2646 //                     or because the user callback asked for it, just set an error status.
2647 //                     The engine will pick that up and stop in its outer loop.
2648 //
2649 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2650 void RegexMatcher::IncrementTime(UErrorCode &status) {
2651     fTickCounter = TIMER_INITIAL_VALUE;
2652     fTime++;
2653     if (fCallbackFn != NULL) {
2654         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2655             status = U_REGEX_STOPPED_BY_CALLER;
2656             return;
2657         }
2658     }
2659     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2660         status = U_REGEX_TIME_OUT;
2661     }
2662 }
2663 
2664 //--------------------------------------------------------------------------------
2665 //
2666 //   ReportFindProgress     This function is called once for each advance in the target
2667 //                          string from the find() function, and calls the user progress callback
2668 //                          function if there is one installed.
2669 //
2670 //                          NOTE:
2671 //
2672 //                          If the user callback asks for the operation to be aborted, this
2673 //                          function returns FALSE.  The status is reset to U_ZERO_ERROR, not
2674 //                          U_REGEX_STOPPED_BY_CALLER, so callers must check the return value.
2675 //
2676 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2677 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2678     if (fFindProgressCallbackFn != NULL) {
2679         if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2680             status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2681             return FALSE;
2682         }
2683     }
2684     return TRUE;
2685 }
2686 
2687 //--------------------------------------------------------------------------------
2688 //
2689 //   StateSave
2690 //       Make a new stack frame, initialized as a copy of the current stack frame.
2691 //       Set the pattern index in the original stack frame from the operand value
2692 //       in the opcode.  Execution of the engine continues with the state in
2693 //       the newly created stack frame
2694 //
2695 //       Note that reserveBlock() may grow the stack, resulting in the
2696 //       whole thing being relocated in memory.
2697 //
2698 //    Parameters:
2699 //       fp           The top frame pointer when called.  At return, a new
2700 //                    frame will be present.
2701 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2702 //                    (not new) frame.  If execution ever back-tracks out of the
2703 //                    new frame, this will be where we continue from in the pattern.
2704 //    Return
2705 //                    The new frame pointer.
2706 //
2707 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2708 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2709     // push storage for a new frame.
2710     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2711     if (newFP == NULL) {
2712         // Failure on attempted stack expansion.
2713         //   Stack function set some other error code, change it to a more
2714         //   specific one for regular expressions.
2715         status = U_REGEX_STACK_OVERFLOW;
2716         // We need to return a writable stack frame, so just return the
2717         //    previous frame.  The match operation will stop quickly
2718         //    because of the error status, after which the frame will never
2719         //    be looked at again.
2720         return fp;
2721     }
2722     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2723 
2724     // New stack frame = copy of old top frame.
2725     int64_t *source = (int64_t *)fp;
2726     int64_t *dest   = newFP;
2727     for (;;) {
2728         *dest++ = *source++;
2729         if (source == newFP) {
2730             break;
2731         }
2732     }
2733 
2734     fTickCounter--;
2735     if (fTickCounter <= 0) {
2736        IncrementTime(status);    // Re-initializes fTickCounter
2737     }
2738     fp->fPatIdx = savePatIdx;
2739     return (REStackFrame *)newFP;
2740 }
2741 
2742 
2743 //--------------------------------------------------------------------------------
2744 //
2745 //   MatchAt      This is the actual matching engine.
2746 //
2747 //                  startIdx:    begin matching at this index.
2748 //                  toEnd:       if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752     UBool       isMatch  = FALSE;      // True if the we have a match.
2753 
2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755 
2756     int32_t     op;                    // Operation from the compiled pattern, split into
2757     int32_t     opType;                //    the opcode
2758     int32_t     opValue;               //    and the operand value.
2759 
2760     #ifdef REGEX_RUN_DEBUG
2761     if (fTraceDebug)
2762     {
2763         printf("MatchAt(startIdx=%ld)\n", startIdx);
2764         printf("Original Pattern: ");
2765         UChar32 c = utext_next32From(fPattern->fPattern, 0);
2766         while (c != U_SENTINEL) {
2767             if (c<32 || c>256) {
2768                 c = '.';
2769             }
2770             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2771 
2772             c = UTEXT_NEXT32(fPattern->fPattern);
2773         }
2774         printf("\n");
2775         printf("Input String: ");
2776         c = utext_next32From(fInputText, 0);
2777         while (c != U_SENTINEL) {
2778             if (c<32 || c>256) {
2779                 c = '.';
2780             }
2781             printf("%c", c);
2782 
2783             c = UTEXT_NEXT32(fInputText);
2784         }
2785         printf("\n");
2786         printf("\n");
2787     }
2788     #endif
2789 
2790     if (U_FAILURE(status)) {
2791         return;
2792     }
2793 
2794     //  Cache frequently referenced items from the compiled pattern
2795     //
2796     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2797 
2798     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2799     UVector             *sets          = fPattern->fSets;
2800 
2801     fFrameSize = fPattern->fFrameSize;
2802     REStackFrame        *fp            = resetStack();
2803 
2804     fp->fPatIdx   = 0;
2805     fp->fInputIdx = startIdx;
2806 
2807     // Zero out the pattern's static data
2808     int32_t i;
2809     for (i = 0; i<fPattern->fDataSize; i++) {
2810         fData[i] = 0;
2811     }
2812 
2813     //
2814     //  Main loop for interpreting the compiled pattern.
2815     //  One iteration of the loop per pattern operation performed.
2816     //
2817     for (;;) {
2818 #if 0
2819         if (_heapchk() != _HEAPOK) {
2820             fprintf(stderr, "Heap Trouble\n");
2821         }
2822 #endif
2823 
2824         op      = (int32_t)pat[fp->fPatIdx];
2825         opType  = URX_TYPE(op);
2826         opValue = URX_VAL(op);
2827         #ifdef REGEX_RUN_DEBUG
2828         if (fTraceDebug) {
2829             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2831                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2832             fPattern->dumpOp(fp->fPatIdx);
2833         }
2834         #endif
2835         fp->fPatIdx++;
2836 
2837         switch (opType) {
2838 
2839 
2840         case URX_NOP:
2841             break;
2842 
2843 
2844         case URX_BACKTRACK:
2845             // Force a backtrack.  In some circumstances, the pattern compiler
2846             //   will notice that the pattern can't possibly match anything, and will
2847             //   emit one of these at that point.
2848             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2849             break;
2850 
2851 
2852         case URX_ONECHAR:
2853             if (fp->fInputIdx < fActiveLimit) {
2854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2855                 UChar32 c = UTEXT_NEXT32(fInputText);
2856                 if (c == opValue) {
2857                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2858                     break;
2859                 }
2860             } else {
2861                 fHitEnd = TRUE;
2862             }
2863             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2864             break;
2865 
2866 
2867         case URX_STRING:
2868             {
2869                 // Test input against a literal string.
2870                 // Strings require two slots in the compiled pattern, one for the
2871                 //   offset to the string text, and one for the length.
2872 
2873                 int32_t   stringStartIdx = opValue;
2874                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2875                 fp->fPatIdx++;
2876                 opType    = URX_TYPE(op);
2877                 int32_t stringLen = URX_VAL(op);
2878                 U_ASSERT(opType == URX_STRING_LEN);
2879                 U_ASSERT(stringLen >= 2);
2880 
2881                 const UChar *patternString = litText+stringStartIdx;
2882                 int32_t patternStringIndex = 0;
2883                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2884                 UChar32 inputChar;
2885                 UChar32 patternChar;
2886                 UBool success = TRUE;
2887                 while (patternStringIndex < stringLen) {
2888                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2889                         success = FALSE;
2890                         fHitEnd = TRUE;
2891                         break;
2892                     }
2893                     inputChar = UTEXT_NEXT32(fInputText);
2894                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2895                     if (patternChar != inputChar) {
2896                         success = FALSE;
2897                         break;
2898                     }
2899                 }
2900 
2901                 if (success) {
2902                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2903                 } else {
2904                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2905                 }
2906             }
2907             break;
2908 
2909 
2910         case URX_STATE_SAVE:
2911             fp = StateSave(fp, opValue, status);
2912             break;
2913 
2914 
2915         case URX_END:
2916             // The match loop will exit via this path on a successful match,
2917             //   when we reach the end of the pattern.
2918             if (toEnd && fp->fInputIdx != fActiveLimit) {
2919                 // The pattern matched, but not to the end of input.  Try some more.
2920                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2921                 break;
2922             }
2923             isMatch = TRUE;
2924             goto  breakFromLoop;
2925 
2926         // Start and End Capture stack frame variables are laid out out like this:
2927             //  fp->fExtra[opValue]  - The start of a completed capture group
2928             //             opValue+1 - The end   of a completed capture group
2929             //             opValue+2 - the start of a capture group whose end
2930             //                          has not yet been reached (and might not ever be).
2931         case URX_START_CAPTURE:
2932             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2933             fp->fExtra[opValue+2] = fp->fInputIdx;
2934             break;
2935 
2936 
2937         case URX_END_CAPTURE:
2938             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2939             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2940             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2941             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2942             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2943             break;
2944 
2945 
2946         case URX_DOLLAR:                   //  $, test for End of line
2947                                            //     or for position before new line at end of input
2948             {
2949                 if (fp->fInputIdx >= fAnchorLimit) {
2950                     // We really are at the end of input.  Success.
2951                     fHitEnd = TRUE;
2952                     fRequireEnd = TRUE;
2953                     break;
2954                 }
2955 
2956                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2957 
2958                 // If we are positioned just before a new-line that is located at the
2959                 //   end of input, succeed.
2960                 UChar32 c = UTEXT_NEXT32(fInputText);
2961                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2962                     if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2963                         // If not in the middle of a CR/LF sequence
2964                       if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2965                             // At new-line at end of input. Success
2966                             fHitEnd = TRUE;
2967                             fRequireEnd = TRUE;
2968 
2969                             break;
2970                         }
2971                     }
2972                 } else {
2973                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2974                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2975                         fHitEnd = TRUE;
2976                         fRequireEnd = TRUE;
2977                         break;                         // At CR/LF at end of input.  Success
2978                     }
2979                 }
2980 
2981                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2982             }
2983             break;
2984 
2985 
2986          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2987             if (fp->fInputIdx >= fAnchorLimit) {
2988                 // Off the end of input.  Success.
2989                 fHitEnd = TRUE;
2990                 fRequireEnd = TRUE;
2991                 break;
2992             } else {
2993                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2994                 UChar32 c = UTEXT_NEXT32(fInputText);
2995                 // Either at the last character of input, or off the end.
2996                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2997                     fHitEnd = TRUE;
2998                     fRequireEnd = TRUE;
2999                     break;
3000                 }
3001             }
3002 
3003             // Not at end of input.  Back-track out.
3004             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3005             break;
3006 
3007 
3008          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3009              {
3010                  if (fp->fInputIdx >= fAnchorLimit) {
3011                      // We really are at the end of input.  Success.
3012                      fHitEnd = TRUE;
3013                      fRequireEnd = TRUE;
3014                      break;
3015                  }
3016                  // If we are positioned just before a new-line, succeed.
3017                  // It makes no difference where the new-line is within the input.
3018                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019                  UChar32 c = UTEXT_CURRENT32(fInputText);
3020                  if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3021                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3022                      //  In multi-line mode, hitting a new-line just before the end of input does not
3023                      //   set the hitEnd or requireEnd flags
3024                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3025                         break;
3026                      }
3027                  }
3028                  // not at a new line.  Fail.
3029                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030              }
3031              break;
3032 
3033 
3034          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3035              {
3036                  if (fp->fInputIdx >= fAnchorLimit) {
3037                      // We really are at the end of input.  Success.
3038                      fHitEnd = TRUE;
3039                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3040                      break;               //   adding a new-line would not lose the match.
3041                  }
3042                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3043                  // It makes no difference where the new-line is within the input.
3044                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3045                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3046                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3047                  }
3048              }
3049              break;
3050 
3051 
3052        case URX_CARET:                    //  ^, test for start of line
3053             if (fp->fInputIdx != fAnchorStart) {
3054                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3055             }
3056             break;
3057 
3058 
3059        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3060            {
3061                if (fp->fInputIdx == fAnchorStart) {
3062                    // We are at the start input.  Success.
3063                    break;
3064                }
3065                // Check whether character just before the current pos is a new-line
3066                //   unless we are at the end of input
3067                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3068                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3069                if ((fp->fInputIdx < fAnchorLimit) &&
3070                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3071                    //  It's a new-line.  ^ is true.  Success.
3072                    //  TODO:  what should be done with positions between a CR and LF?
3073                    break;
3074                }
3075                // Not at the start of a line.  Fail.
3076                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3077            }
3078            break;
3079 
3080 
3081        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3082            {
3083                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3084                if (fp->fInputIdx <= fAnchorStart) {
3085                    // We are at the start input.  Success.
3086                    break;
3087                }
3088                // Check whether character just before the current pos is a new-line
3089                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3090                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3091                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3092                if (c != 0x0a) {
3093                    // Not at the start of a line.  Back-track out.
3094                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3095                }
3096            }
3097            break;
3098 
3099         case URX_BACKSLASH_B:          // Test for word boundaries
3100             {
3101                 UBool success = isWordBoundary(fp->fInputIdx);
3102                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3103                 if (!success) {
3104                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105                 }
3106             }
3107             break;
3108 
3109 
3110         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3111             {
3112                 UBool success = isUWordBoundary(fp->fInputIdx);
3113                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3114                 if (!success) {
3115                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116                 }
3117             }
3118             break;
3119 
3120 
3121         case URX_BACKSLASH_D:            // Test for decimal digit
3122             {
3123                 if (fp->fInputIdx >= fActiveLimit) {
3124                     fHitEnd = TRUE;
3125                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126                     break;
3127                 }
3128 
3129                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130 
3131                 UChar32 c = UTEXT_NEXT32(fInputText);
3132                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3133                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3134                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3135                 if (success) {
3136                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3137                 } else {
3138                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139                 }
3140             }
3141             break;
3142 
3143 
3144         case URX_BACKSLASH_G:          // Test for position at end of previous match
3145             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3146                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147             }
3148             break;
3149 
3150 
3151         case URX_BACKSLASH_X:
3152             //  Match a Grapheme, as defined by Unicode TR 29.
3153             //  Differs slightly from Perl, which consumes combining marks independently
3154             //    of context.
3155             {
3156 
3157                 // Fail if at end of input
3158                 if (fp->fInputIdx >= fActiveLimit) {
3159                     fHitEnd = TRUE;
3160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3161                     break;
3162                 }
3163 
3164                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3165 
3166                 // Examine (and consume) the current char.
3167                 //   Dispatch into a little state machine, based on the char.
3168                 UChar32  c;
3169                 c = UTEXT_NEXT32(fInputText);
3170                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3171                 UnicodeSet **sets = fPattern->fStaticSets;
3172                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3173                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3174                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3175                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3176                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3177                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3178                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3179                 goto GC_Extend;
3180 
3181 
3182 
3183 GC_L:
3184                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3185                 c = UTEXT_NEXT32(fInputText);
3186                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3188                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3189                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3190                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3191                 (void)UTEXT_PREVIOUS32(fInputText);
3192                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3193                 goto GC_Extend;
3194 
3195 GC_V:
3196                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3197                 c = UTEXT_NEXT32(fInputText);
3198                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3200                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3201                 (void)UTEXT_PREVIOUS32(fInputText);
3202                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3203                 goto GC_Extend;
3204 
3205 GC_T:
3206                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3207                 c = UTEXT_NEXT32(fInputText);
3208                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3210                 (void)UTEXT_PREVIOUS32(fInputText);
3211                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3212                 goto GC_Extend;
3213 
3214 GC_Extend:
3215                 // Combining characters are consumed here
3216                 for (;;) {
3217                     if (fp->fInputIdx >= fActiveLimit) {
3218                         break;
3219                     }
3220                     c = UTEXT_CURRENT32(fInputText);
3221                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3222                         break;
3223                     }
3224                     (void)UTEXT_NEXT32(fInputText);
3225                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3226                 }
3227                 goto GC_Done;
3228 
3229 GC_Control:
3230                 // Most control chars stand alone (don't combine with combining chars),
3231                 //   except that a CR/LF sequence is a single grapheme cluster.
3232                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3233                     c = UTEXT_NEXT32(fInputText);
3234                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235                 }
3236 
3237 GC_Done:
3238                 if (fp->fInputIdx >= fActiveLimit) {
3239                     fHitEnd = TRUE;
3240                 }
3241                 break;
3242             }
3243 
3244 
3245 
3246 
3247         case URX_BACKSLASH_Z:          // Test for end of Input
3248             if (fp->fInputIdx < fAnchorLimit) {
3249                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250             } else {
3251                 fHitEnd = TRUE;
3252                 fRequireEnd = TRUE;
3253             }
3254             break;
3255 
3256 
3257 
3258         case URX_STATIC_SETREF:
3259             {
3260                 // Test input character against one of the predefined sets
3261                 //    (Word Characters, for example)
3262                 // The high bit of the op value is a flag for the match polarity.
3263                 //    0:   success if input char is in set.
3264                 //    1:   success if input char is not in set.
3265                 if (fp->fInputIdx >= fActiveLimit) {
3266                     fHitEnd = TRUE;
3267                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3268                     break;
3269                 }
3270 
3271                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3272                 opValue &= ~URX_NEG_SET;
3273                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3274 
3275                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3276                 UChar32 c = UTEXT_NEXT32(fInputText);
3277                 if (c < 256) {
3278                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3279                     if (s8->contains(c)) {
3280                         success = !success;
3281                     }
3282                 } else {
3283                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3284                     if (s->contains(c)) {
3285                         success = !success;
3286                     }
3287                 }
3288                 if (success) {
3289                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3290                 } else {
3291                     // the character wasn't in the set.
3292                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293                 }
3294             }
3295             break;
3296 
3297 
3298         case URX_STAT_SETREF_N:
3299             {
3300                 // Test input character for NOT being a member of  one of
3301                 //    the predefined sets (Word Characters, for example)
3302                 if (fp->fInputIdx >= fActiveLimit) {
3303                     fHitEnd = TRUE;
3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305                     break;
3306                 }
3307 
3308                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3309 
3310                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3311 
3312                 UChar32 c = UTEXT_NEXT32(fInputText);
3313                 if (c < 256) {
3314                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315                     if (s8->contains(c) == FALSE) {
3316                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3317                         break;
3318                     }
3319                 } else {
3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321                     if (s->contains(c) == FALSE) {
3322                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3323                         break;
3324                     }
3325                 }
3326                 // the character wasn't in the set.
3327                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328             }
3329             break;
3330 
3331 
3332         case URX_SETREF:
3333             if (fp->fInputIdx >= fActiveLimit) {
3334                 fHitEnd = TRUE;
3335                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336                 break;
3337             } else {
3338                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3339 
3340                 // There is input left.  Pick up one char and test it for set membership.
3341                 UChar32 c = UTEXT_NEXT32(fInputText);
3342                 U_ASSERT(opValue > 0 && opValue < sets->size());
3343                 if (c<256) {
3344                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3345                     if (s8->contains(c)) {
3346                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3347                         break;
3348                     }
3349                 } else {
3350                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3351                     if (s->contains(c)) {
3352                         // The character is in the set.  A Match.
3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354                         break;
3355                     }
3356                 }
3357 
3358                 // the character wasn't in the set.
3359                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3360             }
3361             break;
3362 
3363 
3364         case URX_DOTANY:
3365             {
3366                 // . matches anything, but stops at end-of-line.
3367                 if (fp->fInputIdx >= fActiveLimit) {
3368                     // At end of input.  Match failed.  Backtrack out.
3369                     fHitEnd = TRUE;
3370                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371                     break;
3372                 }
3373 
3374                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3375 
3376                 // There is input left.  Advance over one char, unless we've hit end-of-line
3377                 UChar32 c = UTEXT_NEXT32(fInputText);
3378                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3379                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3380                     // End of line in normal mode.   . does not match.
3381                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382                     break;
3383                 }
3384                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3385             }
3386             break;
3387 
3388 
3389         case URX_DOTANY_ALL:
3390             {
3391                 // ., in dot-matches-all (including new lines) mode
3392                 if (fp->fInputIdx >= fActiveLimit) {
3393                     // At end of input.  Match failed.  Backtrack out.
3394                     fHitEnd = TRUE;
3395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396                     break;
3397                 }
3398 
3399                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3400 
3401                 // There is input left.  Advance over one char, except if we are
3402                 //   at a cr/lf, advance over both of them.
3403                 UChar32 c;
3404                 c = UTEXT_NEXT32(fInputText);
3405                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3407                     // In the case of a CR/LF, we need to advance over both.
3408                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3409                     if (nextc == 0x0a) {
3410                         (void)UTEXT_NEXT32(fInputText);
3411                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3412                     }
3413                 }
3414             }
3415             break;
3416 
3417 
3418         case URX_DOTANY_UNIX:
3419             {
3420                 // '.' operator, matches all, but stops at end-of-line.
3421                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3422                 if (fp->fInputIdx >= fActiveLimit) {
3423                     // At end of input.  Match failed.  Backtrack out.
3424                     fHitEnd = TRUE;
3425                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3426                     break;
3427                 }
3428 
3429                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3430 
3431                 // There is input left.  Advance over one char, unless we've hit end-of-line
3432                 UChar32 c = UTEXT_NEXT32(fInputText);
3433                 if (c == 0x0a) {
3434                     // End of line in UNIX_LINES mode.   '.' does not match the \n
3435                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3436                 } else {
3437                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3438                 }
3439             }
3440             break;
3441 
3442 
3443         case URX_JMP:
3444             fp->fPatIdx = opValue;
3445             break;
3446 
3447         case URX_FAIL:
3448             isMatch = FALSE;
3449             goto breakFromLoop;
3450 
3451         case URX_JMP_SAV:
3452             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3453             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3454             fp->fPatIdx = opValue;                         // Then JMP.
3455             break;
3456 
3457         case URX_JMP_SAV_X:
3458             // This opcode is used with (x)+, when x can match a zero length string.
3459             // Same as JMP_SAV, except conditional on the match having made forward progress.
3460             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3461             //   data address of the input position at the start of the loop.
3462             {
3463                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3464                 int32_t  stoOp = (int32_t)pat[opValue-1];
3465                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3466                 int32_t  frameLoc = URX_VAL(stoOp);
3467                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3468                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3469                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3470                 if (prevInputIdx < fp->fInputIdx) {
3471                     // The match did make progress.  Repeat the loop.
3472                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3473                     fp->fPatIdx = opValue;
3474                     fp->fExtra[frameLoc] = fp->fInputIdx;
3475                 }
3476                 // If the input position did not advance, we do nothing here,
3477                 //   execution will fall out of the loop.
3478             }
3479             break;
3480 
3481         case URX_CTR_INIT:
3482             {
3483                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3484                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3485 
3486                 // Pick up the three extra operands that CTR_INIT has, and
3487                 //    skip the pattern location counter past
3488                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3489                 fp->fPatIdx += 3;
3490                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3491                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3492                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3493                 U_ASSERT(minCount>=0);
3494                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3495                 U_ASSERT(loopLoc>fp->fPatIdx);
3496 
3497                 if (minCount == 0) {
3498                     fp = StateSave(fp, loopLoc+1, status);
3499                 }
3500                 if (maxCount == 0) {
3501                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3502                 }
3503             }
3504             break;
3505 
3506         case URX_CTR_LOOP:
3507             {
3508                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3509                 int32_t initOp = (int32_t)pat[opValue];
3510                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3511                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3512                 int32_t minCount  = (int32_t)pat[opValue+2];
3513                 int32_t maxCount  = (int32_t)pat[opValue+3];
3514                 // Increment the counter.  Note: we DIDN'T worry about counter
3515                 //   overflow, since the data comes from UnicodeStrings, which
3516                 //   stores its length in an int32_t. Do we have to think about
3517                 //   this now that we're using UText? Probably not, since the length
3518                 //   in UChar32s is still an int32_t.
3519                 (*pCounter)++;
3520                 U_ASSERT(*pCounter > 0);
3521                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3522                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3523                     break;
3524                 }
3525                 if (*pCounter >= minCount) {
3526                     fp = StateSave(fp, fp->fPatIdx, status);
3527                 }
3528                 fp->fPatIdx = opValue + 4;    // Loop back.
3529             }
3530             break;
3531 
3532         case URX_CTR_INIT_NG:
3533             {
3534                 // Initialize a non-greedy loop
3535                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3536                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3537 
3538                 // Pick up the three extra operands that CTR_INIT has, and
3539                 //    skip the pattern location counter past
3540                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3541                 fp->fPatIdx += 3;
3542                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3543                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3544                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3545                 U_ASSERT(minCount>=0);
3546                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3547                 U_ASSERT(loopLoc>fp->fPatIdx);
3548 
3549                 if (minCount == 0) {
3550                     if (maxCount != 0) {
3551                         fp = StateSave(fp, fp->fPatIdx, status);
3552                     }
3553                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3554                 }
3555             }
3556             break;
3557 
3558         case URX_CTR_LOOP_NG:
3559             {
3560                 // Non-greedy {min, max} loops
3561                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3562                 int32_t initOp = (int32_t)pat[opValue];
3563                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3564                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3565                 int32_t minCount  = (int32_t)pat[opValue+2];
3566                 int32_t maxCount  = (int32_t)pat[opValue+3];
3567                 // Increment the counter.  Note: we DIDN'T worry about counter
3568                 //   overflow, since the data comes from UnicodeStrings, which
3569                 //   stores its length in an int32_t. Do we have to think about
3570                 //   this now that we're using UText? Probably not, since the length
3571                 //   in UChar32s is still an int32_t.
3572                 (*pCounter)++;
3573                 U_ASSERT(*pCounter > 0);
3574 
3575                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3576                     // The loop has matched the maximum permitted number of times.
3577                     //   Break out of here with no action.  Matching will
3578                     //   continue with the following pattern.
3579                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
3580                     break;
3581                 }
3582 
3583                 if (*pCounter < minCount) {
3584                     // We haven't met the minimum number of matches yet.
3585                     //   Loop back for another one.
3586                     fp->fPatIdx = opValue + 4;    // Loop back.
3587                 } else {
3588                     // We do have the minimum number of matches.
3589                     //   Fall into the following pattern, but first do
3590                     //   a state save to the top of the loop, so that a failure
3591                     //   in the following pattern will try another iteration of the loop.
3592                     fp = StateSave(fp, opValue + 4, status);
3593                 }
3594             }
3595             break;
3596 
3597         case URX_STO_SP:
3598             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3599             fData[opValue] = fStack->size();
3600             break;
3601 
3602         case URX_LD_SP:
3603             {
3604                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3605                 int32_t newStackSize = (int32_t)fData[opValue];
3606                 U_ASSERT(newStackSize <= fStack->size());
3607                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3608                 if (newFP == (int64_t *)fp) {
3609                     break;
3610                 }
3611                 int32_t i;
3612                 for (i=0; i<fFrameSize; i++) {
3613                     newFP[i] = ((int64_t *)fp)[i];
3614                 }
3615                 fp = (REStackFrame *)newFP;
3616                 fStack->setSize(newStackSize);
3617             }
3618             break;
3619 
3620         case URX_BACKREF:
3621             {
3622                 U_ASSERT(opValue < fFrameSize);
3623                 int64_t groupStartIdx = fp->fExtra[opValue];
3624                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3625                 U_ASSERT(groupStartIdx <= groupEndIdx);
3626                 if (groupStartIdx < 0) {
3627                     // This capture group has not participated in the match thus far,
3628                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3629                     break;
3630                 }
3631                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3632                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3633 
3634                 //   Note: if the capture group match was of an empty string the backref
3635                 //         match succeeds.  Verified by testing:  Perl matches succeed
3636                 //         in this case, so we do too.
3637 
3638                 UBool success = TRUE;
3639                 for (;;) {
3640                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3641                         success = TRUE;
3642                         break;
3643                     }
3644                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3645                         success = FALSE;
3646                         fHitEnd = TRUE;
3647                         break;
3648                     }
3649                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3650                     UChar32 inputChar = utext_next32(fInputText);
3651                     if (inputChar != captureGroupChar) {
3652                         success = FALSE;
3653                         break;
3654                     }
3655                 }
3656 
3657                 if (success) {
3658                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3659                 } else {
3660                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3661                 }
3662             }
3663             break;
3664 
3665 
3666 
3667         case URX_BACKREF_I:
3668             {
3669                 U_ASSERT(opValue < fFrameSize);
3670                 int64_t groupStartIdx = fp->fExtra[opValue];
3671                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3672                 U_ASSERT(groupStartIdx <= groupEndIdx);
3673                 if (groupStartIdx < 0) {
3674                     // This capture group has not participated in the match thus far,
3675                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3676                     break;
3677                 }
3678                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3679                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3680                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3681                 CaseFoldingUTextIterator inputItr(*fInputText);
3682 
3683                 //   Note: if the capture group match was of an empty string the backref
3684                 //         match succeeds.  Verified by testing:  Perl matches succeed
3685                 //         in this case, so we do too.
3686 
3687                 UBool success = TRUE;
3688                 for (;;) {
3689                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3690                         success = TRUE;
3691                         break;
3692                     }
3693                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3694                         success = FALSE;
3695                         fHitEnd = TRUE;
3696                         break;
3697                     }
3698                     UChar32 captureGroupChar = captureGroupItr.next();
3699                     UChar32 inputChar = inputItr.next();
3700                     if (inputChar != captureGroupChar) {
3701                         success = FALSE;
3702                         break;
3703                     }
3704                 }
3705 
3706                 if (success && inputItr.inExpansion()) {
3707                     // We obtained a match by consuming part of a string obtained from
3708                     // case-folding a single code point of the input text.
3709                     // This does not count as an overall match.
3710                     success = FALSE;
3711                 }
3712 
3713                 if (success) {
3714                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3715                 } else {
3716                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3717                 }
3718 
3719             }
3720             break;
3721 
3722         case URX_STO_INP_LOC:
3723             {
3724                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3725                 fp->fExtra[opValue] = fp->fInputIdx;
3726             }
3727             break;
3728 
3729         case URX_JMPX:
3730             {
3731                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3732                 fp->fPatIdx += 1;
3733                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3734                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3735                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3736                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3737                 if (savedInputIdx < fp->fInputIdx) {
3738                     fp->fPatIdx = opValue;                               // JMP
3739                 } else {
3740                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3741                 }
3742             }
3743             break;
3744 
3745         case URX_LA_START:
3746             {
3747                 // Entering a lookahead block.
3748                 // Save Stack Ptr, Input Pos.
3749                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3750                 fData[opValue]   = fStack->size();
3751                 fData[opValue+1] = fp->fInputIdx;
3752                 fActiveStart     = fLookStart;          // Set the match region change for
3753                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3754             }
3755             break;
3756 
3757         case URX_LA_END:
3758             {
3759                 // Leaving a look-ahead block.
3760                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3761                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3762                 int32_t stackSize = fStack->size();
3763                 int32_t newStackSize =(int32_t)fData[opValue];
3764                 U_ASSERT(stackSize >= newStackSize);
3765                 if (stackSize > newStackSize) {
3766                     // Copy the current top frame back to the new (cut back) top frame.
3767                     //   This makes the capture groups from within the look-ahead
3768                     //   expression available.
3769                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3770                     int32_t i;
3771                     for (i=0; i<fFrameSize; i++) {
3772                         newFP[i] = ((int64_t *)fp)[i];
3773                     }
3774                     fp = (REStackFrame *)newFP;
3775                     fStack->setSize(newStackSize);
3776                 }
3777                 fp->fInputIdx = fData[opValue+1];
3778 
3779                 // Restore the active region bounds in the input string; they may have
3780                 //    been changed because of transparent bounds on a Region.
3781                 fActiveStart = fRegionStart;
3782                 fActiveLimit = fRegionLimit;
3783             }
3784             break;
3785 
3786         case URX_ONECHAR_I:
3787             // Case insensitive one char.  The char from the pattern is already case folded.
3788             // Input text is not, but case folding the input can not reduce two or more code
3789             // points to one.
3790             if (fp->fInputIdx < fActiveLimit) {
3791                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3792 
3793                 UChar32 c = UTEXT_NEXT32(fInputText);
3794                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3795                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3796                     break;
3797                 }
3798             } else {
3799                 fHitEnd = TRUE;
3800             }
3801 
3802             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3803             break;
3804 
3805         case URX_STRING_I:
3806             {
3807                 // Case-insensitive test input against a literal string.
3808                 // Strings require two slots in the compiled pattern, one for the
3809                 //   offset to the string text, and one for the length.
3810                 //   The compiled string has already been case folded.
3811                 {
3812                     const UChar *patternString = litText + opValue;
3813                     int32_t      patternStringIdx  = 0;
3814 
3815                     op      = (int32_t)pat[fp->fPatIdx];
3816                     fp->fPatIdx++;
3817                     opType  = URX_TYPE(op);
3818                     opValue = URX_VAL(op);
3819                     U_ASSERT(opType == URX_STRING_LEN);
3820                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3821 
3822 
3823                     UChar32   cPattern;
3824                     UChar32   cText;
3825                     UBool     success = TRUE;
3826 
3827                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3828                     CaseFoldingUTextIterator inputIterator(*fInputText);
3829                     while (patternStringIdx < patternStringLen) {
3830                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3831                             success = FALSE;
3832                             fHitEnd = TRUE;
3833                             break;
3834                         }
3835                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3836                         cText = inputIterator.next();
3837                         if (cText != cPattern) {
3838                             success = FALSE;
3839                             break;
3840                         }
3841                     }
3842                     if (inputIterator.inExpansion()) {
3843                         success = FALSE;
3844                     }
3845 
3846                     if (success) {
3847                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3848                     } else {
3849                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3850                     }
3851                 }
3852             }
3853             break;
3854 
3855         case URX_LB_START:
3856             {
3857                 // Entering a look-behind block.
3858                 // Save Stack Ptr, Input Pos.
3859                 //   TODO:  implement transparent bounds.  Ticket #6067
3860                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861                 fData[opValue]   = fStack->size();
3862                 fData[opValue+1] = fp->fInputIdx;
3863                 // Init the variable containing the start index for attempted matches.
3864                 fData[opValue+2] = -1;
3865                 // Save input string length, then reset to pin any matches to end at
3866                 //   the current position.
3867                 fData[opValue+3] = fActiveLimit;
3868                 fActiveLimit     = fp->fInputIdx;
3869             }
3870             break;
3871 
3872 
3873         case URX_LB_CONT:
3874             {
3875                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3876                 //    at all possible input starting positions.
3877 
3878                 // Fetch the min and max possible match lengths.  They are the operands
3879                 //   of this op in the pattern.
3880                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3881                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3882                 U_ASSERT(minML <= maxML);
3883                 U_ASSERT(minML >= 0);
3884 
3885                 // Fetch (from data) the last input index where a match was attempted.
3886                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3887                 int64_t  *lbStartIdx = &fData[opValue+2];
3888                 if (*lbStartIdx < 0) {
3889                     // First time through loop.
3890                     *lbStartIdx = fp->fInputIdx - minML;
3891                 } else {
3892                     // 2nd through nth time through the loop.
3893                     // Back up start position for match by one.
3894                     if (*lbStartIdx == 0) {
3895                         (*lbStartIdx)--;
3896                     } else {
3897                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3898                         (void)UTEXT_PREVIOUS32(fInputText);
3899                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900                     }
3901                 }
3902 
3903                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3904                     // We have tried all potential match starting points without
3905                     //  getting a match.  Backtrack out, and out of the
3906                     //   Look Behind altogether.
3907                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3908                     int64_t restoreInputLen = fData[opValue+3];
3909                     U_ASSERT(restoreInputLen >= fActiveLimit);
3910                     U_ASSERT(restoreInputLen <= fInputLength);
3911                     fActiveLimit = restoreInputLen;
3912                     break;
3913                 }
3914 
3915                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3916                 //      (successful match will fall off the end of the loop.)
3917                 fp = StateSave(fp, fp->fPatIdx-3, status);
3918                 fp->fInputIdx = *lbStartIdx;
3919             }
3920             break;
3921 
3922         case URX_LB_END:
3923             // End of a look-behind block, after a successful match.
3924             {
3925                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3926                 if (fp->fInputIdx != fActiveLimit) {
3927                     //  The look-behind expression matched, but the match did not
3928                     //    extend all the way to the point that we are looking behind from.
3929                     //  FAIL out of here, which will take us back to the LB_CONT, which
3930                     //     will retry the match starting at another position or fail
3931                     //     the look-behind altogether, whichever is appropriate.
3932                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3933                     break;
3934                 }
3935 
3936                 // Look-behind match is good.  Restore the orignal input string length,
3937                 //   which had been truncated to pin the end of the lookbehind match to the
3938                 //   position being looked-behind.
3939                 int64_t originalInputLen = fData[opValue+3];
3940                 U_ASSERT(originalInputLen >= fActiveLimit);
3941                 U_ASSERT(originalInputLen <= fInputLength);
3942                 fActiveLimit = originalInputLen;
3943             }
3944             break;
3945 
3946 
3947         case URX_LBN_CONT:
3948             {
3949                 // Negative Look-Behind, at top of loop checking for matches of LB expression
3950                 //    at all possible input starting positions.
3951 
3952                 // Fetch the extra parameters of this op.
3953                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3954                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3955                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3956                         continueLoc = URX_VAL(continueLoc);
3957                 U_ASSERT(minML <= maxML);
3958                 U_ASSERT(minML >= 0);
3959                 U_ASSERT(continueLoc > fp->fPatIdx);
3960 
3961                 // Fetch (from data) the last input index where a match was attempted.
3962                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3963                 int64_t  *lbStartIdx = &fData[opValue+2];
3964                 if (*lbStartIdx < 0) {
3965                     // First time through loop.
3966                     *lbStartIdx = fp->fInputIdx - minML;
3967                 } else {
3968                     // 2nd through nth time through the loop.
3969                     // Back up start position for match by one.
3970                     if (*lbStartIdx == 0) {
3971                         (*lbStartIdx)--;
3972                     } else {
3973                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3974                         (void)UTEXT_PREVIOUS32(fInputText);
3975                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3976                     }
3977                 }
3978 
3979                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3980                     // We have tried all potential match starting points without
3981                     //  getting a match, which means that the negative lookbehind as
3982                     //  a whole has succeeded.  Jump forward to the continue location
3983                     int64_t restoreInputLen = fData[opValue+3];
3984                     U_ASSERT(restoreInputLen >= fActiveLimit);
3985                     U_ASSERT(restoreInputLen <= fInputLength);
3986                     fActiveLimit = restoreInputLen;
3987                     fp->fPatIdx = continueLoc;
3988                     break;
3989                 }
3990 
3991                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3992                 //      (successful match will cause a FAIL out of the loop altogether.)
3993                 fp = StateSave(fp, fp->fPatIdx-4, status);
3994                 fp->fInputIdx = *lbStartIdx;
3995             }
3996             break;
3997 
3998         case URX_LBN_END:
3999             // End of a negative look-behind block, after a successful match.
4000             {
4001                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4002                 if (fp->fInputIdx != fActiveLimit) {
4003                     //  The look-behind expression matched, but the match did not
4004                     //    extend all the way to the point that we are looking behind from.
4005                     //  FAIL out of here, which will take us back to the LB_CONT, which
4006                     //     will retry the match starting at another position or succeed
4007                     //     the look-behind altogether, whichever is appropriate.
4008                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4009                     break;
4010                 }
4011 
4012                 // Look-behind expression matched, which means look-behind test as
4013                 //   a whole Fails
4014 
4015                 //   Restore the orignal input string length, which had been truncated
4016                 //   inorder to pin the end of the lookbehind match
4017                 //   to the position being looked-behind.
4018                 int64_t originalInputLen = fData[opValue+3];
4019                 U_ASSERT(originalInputLen >= fActiveLimit);
4020                 U_ASSERT(originalInputLen <= fInputLength);
4021                 fActiveLimit = originalInputLen;
4022 
4023                 // Restore original stack position, discarding any state saved
4024                 //   by the successful pattern match.
4025                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4026                 int32_t newStackSize = (int32_t)fData[opValue];
4027                 U_ASSERT(fStack->size() > newStackSize);
4028                 fStack->setSize(newStackSize);
4029 
4030                 //  FAIL, which will take control back to someplace
4031                 //  prior to entering the look-behind test.
4032                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4033             }
4034             break;
4035 
4036 
4037         case URX_LOOP_SR_I:
4038             // Loop Initialization for the optimized implementation of
4039             //     [some character set]*
4040             //   This op scans through all matching input.
4041             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4042             {
4043                 U_ASSERT(opValue > 0 && opValue < sets->size());
4044                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4045                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4046 
4047                 // Loop through input, until either the input is exhausted or
4048                 //   we reach a character that is not a member of the set.
4049                 int64_t ix = fp->fInputIdx;
4050                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4051                 for (;;) {
4052                     if (ix >= fActiveLimit) {
4053                         fHitEnd = TRUE;
4054                         break;
4055                     }
4056                     UChar32 c = UTEXT_NEXT32(fInputText);
4057                     if (c<256) {
4058                         if (s8->contains(c) == FALSE) {
4059                             break;
4060                         }
4061                     } else {
4062                         if (s->contains(c) == FALSE) {
4063                             break;
4064                         }
4065                     }
4066                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4067                 }
4068 
4069                 // If there were no matching characters, skip over the loop altogether.
4070                 //   The loop doesn't run at all, a * op always succeeds.
4071                 if (ix == fp->fInputIdx) {
4072                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4073                     break;
4074                 }
4075 
4076                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4077                 //   must follow.  It's operand is the stack location
4078                 //   that holds the starting input index for the match of this [set]*
4079                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4080                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4081                 int32_t stackLoc = URX_VAL(loopcOp);
4082                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4083                 fp->fExtra[stackLoc] = fp->fInputIdx;
4084                 fp->fInputIdx = ix;
4085 
4086                 // Save State to the URX_LOOP_C op that follows this one,
4087                 //   so that match failures in the following code will return to there.
4088                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4089                 fp = StateSave(fp, fp->fPatIdx, status);
4090                 fp->fPatIdx++;
4091             }
4092             break;
4093 
4094 
4095         case URX_LOOP_DOT_I:
4096             // Loop Initialization for the optimized implementation of .*
4097             //   This op scans through all remaining input.
4098             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4099             {
4100                 // Loop through input until the input is exhausted (we reach an end-of-line)
4101                 // In DOTALL mode, we can just go straight to the end of the input.
4102                 int64_t ix;
4103                 if ((opValue & 1) == 1) {
4104                     // Dot-matches-All mode.  Jump straight to the end of the string.
4105                     ix = fActiveLimit;
4106                     fHitEnd = TRUE;
4107                 } else {
4108                     // NOT DOT ALL mode.  Line endings do not match '.'
4109                     // Scan forward until a line ending or end of input.
4110                     ix = fp->fInputIdx;
4111                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4112                     for (;;) {
4113                         if (ix >= fActiveLimit) {
4114                             fHitEnd = TRUE;
4115                             break;
4116                         }
4117                         UChar32 c = UTEXT_NEXT32(fInputText);
4118                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4119                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4120                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4121                                     (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4122                                 //  char is a line ending.  Exit the scanning loop.
4123                                 break;
4124                             }
4125                         }
4126                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4127                     }
4128                 }
4129 
4130                 // If there were no matching characters, skip over the loop altogether.
4131                 //   The loop doesn't run at all, a * op always succeeds.
4132                 if (ix == fp->fInputIdx) {
4133                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4134                     break;
4135                 }
4136 
4137                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4138                 //   must follow.  It's operand is the stack location
4139                 //   that holds the starting input index for the match of this .*
4140                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4141                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4142                 int32_t stackLoc = URX_VAL(loopcOp);
4143                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4144                 fp->fExtra[stackLoc] = fp->fInputIdx;
4145                 fp->fInputIdx = ix;
4146 
4147                 // Save State to the URX_LOOP_C op that follows this one,
4148                 //   so that match failures in the following code will return to there.
4149                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4150                 fp = StateSave(fp, fp->fPatIdx, status);
4151                 fp->fPatIdx++;
4152             }
4153             break;
4154 
4155 
4156         case URX_LOOP_C:
4157             {
4158                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4159                 backSearchIndex = fp->fExtra[opValue];
4160                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4161                 if (backSearchIndex == fp->fInputIdx) {
4162                     // We've backed up the input idx to the point that the loop started.
4163                     // The loop is done.  Leave here without saving state.
4164                     //  Subsequent failures won't come back here.
4165                     break;
4166                 }
4167                 // Set up for the next iteration of the loop, with input index
4168                 //   backed up by one from the last time through,
4169                 //   and a state save to this instruction in case the following code fails again.
4170                 //   (We're going backwards because this loop emulates stack unwinding, not
4171                 //    the initial scan forward.)
4172                 U_ASSERT(fp->fInputIdx > 0);
4173                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4174                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4175                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4176 
4177                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4178                 if (prevC == 0x0a &&
4179                     fp->fInputIdx > backSearchIndex &&
4180                     twoPrevC == 0x0d) {
4181                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4182                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4183                         // .*, stepping back over CRLF pair.
4184                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4185                     }
4186                 }
4187 
4188 
4189                 fp = StateSave(fp, fp->fPatIdx-1, status);
4190             }
4191             break;
4192 
4193 
4194 
4195         default:
4196             // Trouble.  The compiled pattern contains an entry with an
4197             //           unrecognized type tag.
4198             U_ASSERT(FALSE);
4199         }
4200 
4201         if (U_FAILURE(status)) {
4202             isMatch = FALSE;
4203             break;
4204         }
4205     }
4206 
4207 breakFromLoop:
4208     fMatch = isMatch;
4209     if (isMatch) {
4210         fLastMatchEnd = fMatchEnd;
4211         fMatchStart   = startIdx;
4212         fMatchEnd     = fp->fInputIdx;
4213         if (fTraceDebug) {
4214             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4215         }
4216     }
4217     else
4218     {
4219         if (fTraceDebug) {
4220             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4221         }
4222     }
4223 
4224     fFrame = fp;                // The active stack frame when the engine stopped.
4225                                 //   Contains the capture group results that we need to
4226                                 //    access later.
4227     return;
4228 }
4229 
4230 
4231 //--------------------------------------------------------------------------------
4232 //
4233 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4234 //                  assumption that the entire string is available in the UText's
4235 //                  chunk buffer. For now, that means we can use int32_t indexes,
4236 //                  except for anything that needs to be saved (like group starts
4237 //                  and ends).
4238 //
4239 //                  startIdx:    begin matching at this index.
4240 //                  toEnd:       if true, match must extend to end of the input region
4241 //
4242 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4243 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4244     UBool       isMatch  = FALSE;      // True if the we have a match.
4245 
4246     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4247 
4248     int32_t     op;                    // Operation from the compiled pattern, split into
4249     int32_t     opType;                //    the opcode
4250     int32_t     opValue;               //    and the operand value.
4251 
4252 #ifdef REGEX_RUN_DEBUG
4253     if (fTraceDebug)
4254     {
4255         printf("MatchAt(startIdx=%ld)\n", startIdx);
4256         printf("Original Pattern: ");
4257         UChar32 c = utext_next32From(fPattern->fPattern, 0);
4258         while (c != U_SENTINEL) {
4259             if (c<32 || c>256) {
4260                 c = '.';
4261             }
4262             REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4263 
4264             c = UTEXT_NEXT32(fPattern->fPattern);
4265         }
4266         printf("\n");
4267         printf("Input String: ");
4268         c = utext_next32From(fInputText, 0);
4269         while (c != U_SENTINEL) {
4270             if (c<32 || c>256) {
4271                 c = '.';
4272             }
4273             printf("%c", c);
4274 
4275             c = UTEXT_NEXT32(fInputText);
4276         }
4277         printf("\n");
4278         printf("\n");
4279     }
4280 #endif
4281 
4282     if (U_FAILURE(status)) {
4283         return;
4284     }
4285 
4286     //  Cache frequently referenced items from the compiled pattern
4287     //
4288     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4289 
4290     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4291     UVector             *sets          = fPattern->fSets;
4292 
4293     const UChar         *inputBuf      = fInputText->chunkContents;
4294 
4295     fFrameSize = fPattern->fFrameSize;
4296     REStackFrame        *fp            = resetStack();
4297 
4298     fp->fPatIdx   = 0;
4299     fp->fInputIdx = startIdx;
4300 
4301     // Zero out the pattern's static data
4302     int32_t i;
4303     for (i = 0; i<fPattern->fDataSize; i++) {
4304         fData[i] = 0;
4305     }
4306 
4307     //
4308     //  Main loop for interpreting the compiled pattern.
4309     //  One iteration of the loop per pattern operation performed.
4310     //
4311     for (;;) {
4312 #if 0
4313         if (_heapchk() != _HEAPOK) {
4314             fprintf(stderr, "Heap Trouble\n");
4315         }
4316 #endif
4317 
4318         op      = (int32_t)pat[fp->fPatIdx];
4319         opType  = URX_TYPE(op);
4320         opValue = URX_VAL(op);
4321 #ifdef REGEX_RUN_DEBUG
4322         if (fTraceDebug) {
4323             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4324             printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4325                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4326             fPattern->dumpOp(fp->fPatIdx);
4327         }
4328 #endif
4329         fp->fPatIdx++;
4330 
4331         switch (opType) {
4332 
4333 
4334         case URX_NOP:
4335             break;
4336 
4337 
4338         case URX_BACKTRACK:
4339             // Force a backtrack.  In some circumstances, the pattern compiler
4340             //   will notice that the pattern can't possibly match anything, and will
4341             //   emit one of these at that point.
4342             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4343             break;
4344 
4345 
4346         case URX_ONECHAR:
4347             if (fp->fInputIdx < fActiveLimit) {
4348                 UChar32 c;
4349                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4350                 if (c == opValue) {
4351                     break;
4352                 }
4353             } else {
4354                 fHitEnd = TRUE;
4355             }
4356             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4357             break;
4358 
4359 
4360         case URX_STRING:
4361             {
4362                 // Test input against a literal string.
4363                 // Strings require two slots in the compiled pattern, one for the
4364                 //   offset to the string text, and one for the length.
4365                 int32_t   stringStartIdx = opValue;
4366                 int32_t   stringLen;
4367 
4368                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4369                 fp->fPatIdx++;
4370                 opType    = URX_TYPE(op);
4371                 stringLen = URX_VAL(op);
4372                 U_ASSERT(opType == URX_STRING_LEN);
4373                 U_ASSERT(stringLen >= 2);
4374 
4375                 const UChar * pInp = inputBuf + fp->fInputIdx;
4376                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4377                 const UChar * pPat = litText+stringStartIdx;
4378                 const UChar * pEnd = pInp + stringLen;
4379                 UBool success = TRUE;
4380                 while (pInp < pEnd) {
4381                     if (pInp >= pInpLimit) {
4382                         fHitEnd = TRUE;
4383                         success = FALSE;
4384                         break;
4385                     }
4386                     if (*pInp++ != *pPat++) {
4387                         success = FALSE;
4388                         break;
4389                     }
4390                 }
4391 
4392                 if (success) {
4393                     fp->fInputIdx += stringLen;
4394                 } else {
4395                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4396                 }
4397             }
4398             break;
4399 
4400 
4401         case URX_STATE_SAVE:
4402             fp = StateSave(fp, opValue, status);
4403             break;
4404 
4405 
4406         case URX_END:
4407             // The match loop will exit via this path on a successful match,
4408             //   when we reach the end of the pattern.
4409             if (toEnd && fp->fInputIdx != fActiveLimit) {
4410                 // The pattern matched, but not to the end of input.  Try some more.
4411                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412                 break;
4413             }
4414             isMatch = TRUE;
4415             goto  breakFromLoop;
4416 
4417             // Start and End Capture stack frame variables are laid out out like this:
4418             //  fp->fExtra[opValue]  - The start of a completed capture group
4419             //             opValue+1 - The end   of a completed capture group
4420             //             opValue+2 - the start of a capture group whose end
4421             //                          has not yet been reached (and might not ever be).
4422         case URX_START_CAPTURE:
4423             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4424             fp->fExtra[opValue+2] = fp->fInputIdx;
4425             break;
4426 
4427 
4428         case URX_END_CAPTURE:
4429             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4430             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4431             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4432             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4433             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4434             break;
4435 
4436 
4437         case URX_DOLLAR:                   //  $, test for End of line
4438             //     or for position before new line at end of input
4439             if (fp->fInputIdx < fAnchorLimit-2) {
4440                 // We are no where near the end of input.  Fail.
4441                 //   This is the common case.  Keep it first.
4442                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4443                 break;
4444             }
4445             if (fp->fInputIdx >= fAnchorLimit) {
4446                 // We really are at the end of input.  Success.
4447                 fHitEnd = TRUE;
4448                 fRequireEnd = TRUE;
4449                 break;
4450             }
4451 
4452             // If we are positioned just before a new-line that is located at the
4453             //   end of input, succeed.
4454             if (fp->fInputIdx == fAnchorLimit-1) {
4455                 UChar32 c;
4456                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4457 
4458                 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4459                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4460                         // At new-line at end of input. Success
4461                         fHitEnd = TRUE;
4462                         fRequireEnd = TRUE;
4463                         break;
4464                     }
4465                 }
4466             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4467                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4468                     fHitEnd = TRUE;
4469                     fRequireEnd = TRUE;
4470                     break;                         // At CR/LF at end of input.  Success
4471             }
4472 
4473             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4474 
4475             break;
4476 
4477 
4478         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4479             if (fp->fInputIdx >= fAnchorLimit-1) {
4480                 // Either at the last character of input, or off the end.
4481                 if (fp->fInputIdx == fAnchorLimit-1) {
4482                     // At last char of input.  Success if it's a new line.
4483                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4484                         fHitEnd = TRUE;
4485                         fRequireEnd = TRUE;
4486                         break;
4487                     }
4488                 } else {
4489                     // Off the end of input.  Success.
4490                     fHitEnd = TRUE;
4491                     fRequireEnd = TRUE;
4492                     break;
4493                 }
4494             }
4495 
4496             // Not at end of input.  Back-track out.
4497             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4498             break;
4499 
4500 
4501         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4502             {
4503                 if (fp->fInputIdx >= fAnchorLimit) {
4504                     // We really are at the end of input.  Success.
4505                     fHitEnd = TRUE;
4506                     fRequireEnd = TRUE;
4507                     break;
4508                 }
4509                 // If we are positioned just before a new-line, succeed.
4510                 // It makes no difference where the new-line is within the input.
4511                 UChar32 c = inputBuf[fp->fInputIdx];
4512                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4513                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4514                     //  In multi-line mode, hitting a new-line just before the end of input does not
4515                     //   set the hitEnd or requireEnd flags
4516                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4517                         break;
4518                     }
4519                 }
4520                 // not at a new line.  Fail.
4521                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4522             }
4523             break;
4524 
4525 
4526         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4527             {
4528                 if (fp->fInputIdx >= fAnchorLimit) {
4529                     // We really are at the end of input.  Success.
4530                     fHitEnd = TRUE;
4531                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4532                     break;               //   adding a new-line would not lose the match.
4533                 }
4534                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4535                 // It makes no difference where the new-line is within the input.
4536                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4537                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538                 }
4539             }
4540             break;
4541 
4542 
4543         case URX_CARET:                    //  ^, test for start of line
4544             if (fp->fInputIdx != fAnchorStart) {
4545                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4546             }
4547             break;
4548 
4549 
4550         case URX_CARET_M:                   //  ^, test for start of line in multi-line mode
4551             {
4552                 if (fp->fInputIdx == fAnchorStart) {
4553                     // We are at the start of input.  Success.
4554                     break;
4555                 }
4556                 // Check whether character just before the current pos is a new-line
4557                 //   unless we are at the end of input
4558                 UChar  c = inputBuf[fp->fInputIdx - 1];
4559                 if ((fp->fInputIdx < fAnchorLimit) &&
4560                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4561                     //  It's a new-line.  ^ is true.  Success.
4562                     //  TODO:  what should be done with positions between a CR and LF?
4563                     break;
4564                 }
4565                 // Not at the start of a line.  Fail.
4566                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4567             }
4568             break;
4569 
4570 
4571         case URX_CARET_M_UNIX:       //  ^, test for start of line in multi-line + Unix-line mode
4572             {
4573                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4574                 if (fp->fInputIdx <= fAnchorStart) {
4575                     // We are at the start of input.  Success.
4576                     break;
4577                 }
4578                 // Check whether character just before the current pos is a new-line
4579                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4580                 UChar  c = inputBuf[fp->fInputIdx - 1];
4581                 if (c != 0x0a) {
4582                     // Not at the start of a line.  Back-track out.
4583                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4584                 }
4585             }
4586             break;
4587 
4588         case URX_BACKSLASH_B:          // Test for word boundaries
4589             {
4590                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4591                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4592                 if (!success) {
4593                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594                 }
4595             }
4596             break;
4597 
4598 
4599         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4600             {
4601                 UBool success = isUWordBoundary(fp->fInputIdx);
4602                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4603                 if (!success) {
4604                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4605                 }
4606             }
4607             break;
4608 
4609 
4610         case URX_BACKSLASH_D:            // Test for decimal digit
4611             {
4612                 if (fp->fInputIdx >= fActiveLimit) {
4613                     fHitEnd = TRUE;
4614                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615                     break;
4616                 }
4617 
4618                 UChar32 c;
4619                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4621                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4622                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4623                 if (!success) {
4624                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4625                 }
4626             }
4627             break;
4628 
4629 
4630         case URX_BACKSLASH_G:          // Test for position at end of previous match
4631             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4632                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4633             }
4634             break;
4635 
4636 
4637         case URX_BACKSLASH_X:
4638         //  Match a Grapheme, as defined by Unicode TR 29.
4639         //  Differs slightly from Perl, which consumes combining marks independently
4640         //    of context.
4641         {
4642 
4643             // Fail if at end of input
4644             if (fp->fInputIdx >= fActiveLimit) {
4645                 fHitEnd = TRUE;
4646                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4647                 break;
4648             }
4649 
4650             // Examine (and consume) the current char.
4651             //   Dispatch into a little state machine, based on the char.
4652             UChar32  c;
4653             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4654             UnicodeSet **sets = fPattern->fStaticSets;
4655             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4656             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4657             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4658             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4659             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4660             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4661             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4662             goto GC_Extend;
4663 
4664 
4665 
4666 GC_L:
4667             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4668             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4669             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4670             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4671             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4672             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4673             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4674             goto GC_Extend;
4675 
4676 GC_V:
4677             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4678             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4679             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4680             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4681             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4682             goto GC_Extend;
4683 
4684 GC_T:
4685             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4686             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4687             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4688             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4689             goto GC_Extend;
4690 
4691 GC_Extend:
4692             // Combining characters are consumed here
4693             for (;;) {
4694                 if (fp->fInputIdx >= fActiveLimit) {
4695                     break;
4696                 }
4697                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4698                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4699                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4700                     break;
4701                 }
4702             }
4703             goto GC_Done;
4704 
4705 GC_Control:
4706             // Most control chars stand alone (don't combine with combining chars),
4707             //   except for that CR/LF sequence is a single grapheme cluster.
4708             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4709                 fp->fInputIdx++;
4710             }
4711 
4712 GC_Done:
4713             if (fp->fInputIdx >= fActiveLimit) {
4714                 fHitEnd = TRUE;
4715             }
4716             break;
4717         }
4718 
4719 
4720 
4721 
4722         case URX_BACKSLASH_Z:          // Test for end of Input
4723             if (fp->fInputIdx < fAnchorLimit) {
4724                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4725             } else {
4726                 fHitEnd = TRUE;
4727                 fRequireEnd = TRUE;
4728             }
4729             break;
4730 
4731 
4732 
4733         case URX_STATIC_SETREF:
4734             {
4735                 // Test input character against one of the predefined sets
4736                 //    (Word Characters, for example)
4737                 // The high bit of the op value is a flag for the match polarity.
4738                 //    0:   success if input char is in set.
4739                 //    1:   success if input char is not in set.
4740                 if (fp->fInputIdx >= fActiveLimit) {
4741                     fHitEnd = TRUE;
4742                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4743                     break;
4744                 }
4745 
4746                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4747                 opValue &= ~URX_NEG_SET;
4748                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4749 
4750                 UChar32 c;
4751                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4752                 if (c < 256) {
4753                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4754                     if (s8->contains(c)) {
4755                         success = !success;
4756                     }
4757                 } else {
4758                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4759                     if (s->contains(c)) {
4760                         success = !success;
4761                     }
4762                 }
4763                 if (!success) {
4764                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765                 }
4766             }
4767             break;
4768 
4769 
4770         case URX_STAT_SETREF_N:
4771             {
4772                 // Test input character for NOT being a member of  one of
4773                 //    the predefined sets (Word Characters, for example)
4774                 if (fp->fInputIdx >= fActiveLimit) {
4775                     fHitEnd = TRUE;
4776                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4777                     break;
4778                 }
4779 
4780                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4781 
4782                 UChar32  c;
4783                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4784                 if (c < 256) {
4785                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4786                     if (s8->contains(c) == FALSE) {
4787                         break;
4788                     }
4789                 } else {
4790                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4791                     if (s->contains(c) == FALSE) {
4792                         break;
4793                     }
4794                 }
4795                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4796             }
4797             break;
4798 
4799 
4800         case URX_SETREF:
4801             {
4802                 if (fp->fInputIdx >= fActiveLimit) {
4803                     fHitEnd = TRUE;
4804                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4805                     break;
4806                 }
4807 
4808                 U_ASSERT(opValue > 0 && opValue < sets->size());
4809 
4810                 // There is input left.  Pick up one char and test it for set membership.
4811                 UChar32  c;
4812                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4813                 if (c<256) {
4814                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4815                     if (s8->contains(c)) {
4816                         // The character is in the set.  A Match.
4817                         break;
4818                     }
4819                 } else {
4820                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4821                     if (s->contains(c)) {
4822                         // The character is in the set.  A Match.
4823                         break;
4824                     }
4825                 }
4826 
4827                 // the character wasn't in the set.
4828                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4829             }
4830             break;
4831 
4832 
4833         case URX_DOTANY:
4834             {
4835                 // . matches anything, but stops at end-of-line.
4836                 if (fp->fInputIdx >= fActiveLimit) {
4837                     // At end of input.  Match failed.  Backtrack out.
4838                     fHitEnd = TRUE;
4839                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840                     break;
4841                 }
4842 
4843                 // There is input left.  Advance over one char, unless we've hit end-of-line
4844                 UChar32  c;
4845                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4846                 if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
4847                     ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4848                     // End of line in normal mode.   . does not match.
4849                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4850                     break;
4851                 }
4852             }
4853             break;
4854 
4855 
4856         case URX_DOTANY_ALL:
4857             {
4858                 // . in dot-matches-all (including new lines) mode
4859                 if (fp->fInputIdx >= fActiveLimit) {
4860                     // At end of input.  Match failed.  Backtrack out.
4861                     fHitEnd = TRUE;
4862                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4863                     break;
4864                 }
4865 
4866                 // There is input left.  Advance over one char, except if we are
4867                 //   at a cr/lf, advance over both of them.
4868                 UChar32 c;
4869                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4870                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4871                     // In the case of a CR/LF, we need to advance over both.
4872                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4873                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4874                     }
4875                 }
4876             }
4877             break;
4878 
4879 
4880         case URX_DOTANY_UNIX:
4881             {
4882                 // '.' operator, matches all, but stops at end-of-line.
4883                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4884                 if (fp->fInputIdx >= fActiveLimit) {
4885                     // At end of input.  Match failed.  Backtrack out.
4886                     fHitEnd = TRUE;
4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                     break;
4889                 }
4890 
4891                 // There is input left.  Advance over one char, unless we've hit end-of-line
4892                 UChar32 c;
4893                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894                 if (c == 0x0a) {
4895                     // End of line in UNIX_LINES mode.   '.' does not match the \n
4896                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4897                 }
4898             }
4899             break;
4900 
4901 
4902         case URX_JMP:
4903             fp->fPatIdx = opValue;
4904             break;
4905 
4906         case URX_FAIL:
4907             isMatch = FALSE;
4908             goto breakFromLoop;
4909 
4910         case URX_JMP_SAV:
4911             U_ASSERT(opValue < fPattern->fCompiledPat->size());
4912             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4913             fp->fPatIdx = opValue;                         // Then JMP.
4914             break;
4915 
4916         case URX_JMP_SAV_X:
4917             // This opcode is used with (x)+, when x can match a zero length string.
4918             // Same as JMP_SAV, except conditional on the match having made forward progress.
4919             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4920             //   data address of the input position at the start of the loop.
4921             {
4922                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4923                 int32_t  stoOp = (int32_t)pat[opValue-1];
4924                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4925                 int32_t  frameLoc = URX_VAL(stoOp);
4926                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4927                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4928                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4929                 if (prevInputIdx < fp->fInputIdx) {
4930                     // The match did make progress.  Repeat the loop.
4931                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4932                     fp->fPatIdx = opValue;
4933                     fp->fExtra[frameLoc] = fp->fInputIdx;
4934                 }
4935                 // If the input position did not advance, we do nothing here,
4936                 //   execution will fall out of the loop.
4937             }
4938             break;
4939 
4940         case URX_CTR_INIT:
4941             {
4942                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4943                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
4944 
4945                 // Pick up the three extra operands that CTR_INIT has, and
4946                 //    skip the pattern location counter past
4947                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4948                 fp->fPatIdx += 3;
4949                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4950                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4951                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4952                 U_ASSERT(minCount>=0);
4953                 U_ASSERT(maxCount>=minCount || maxCount==-1);
4954                 U_ASSERT(loopLoc>fp->fPatIdx);
4955 
4956                 if (minCount == 0) {
4957                     fp = StateSave(fp, loopLoc+1, status);
4958                 }
4959                 if (maxCount == 0) {
4960                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961                 }
4962             }
4963             break;
4964 
4965         case URX_CTR_LOOP:
4966             {
4967                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4968                 int32_t initOp = (int32_t)pat[opValue];
4969                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4970                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4971                 int32_t minCount  = (int32_t)pat[opValue+2];
4972                 int32_t maxCount  = (int32_t)pat[opValue+3];
4973                 // Increment the counter.  Note: we DIDN'T worry about counter
4974                 //   overflow, since the data comes from UnicodeStrings, which
4975                 //   stores its length in an int32_t. Do we have to think about
4976                 //   this now that we're using UText? Probably not, since the length
4977                 //   in UChar32s is still an int32_t.
4978                 (*pCounter)++;
4979                 U_ASSERT(*pCounter > 0);
4980                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
4981                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
4982                     break;
4983                 }
4984                 if (*pCounter >= minCount) {
4985                     fp = StateSave(fp, fp->fPatIdx, status);
4986                 }
4987                 fp->fPatIdx = opValue + 4;    // Loop back.
4988             }
4989             break;
4990 
4991         case URX_CTR_INIT_NG:
4992             {
4993                 // Initialize a non-greedy loop
4994                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4995                 fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
4996 
4997                 // Pick up the three extra operands that CTR_INIT has, and
4998                 //    skip the pattern location counter past
4999                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5000                 fp->fPatIdx += 3;
5001                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5002                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5003                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5004                 U_ASSERT(minCount>=0);
5005                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5006                 U_ASSERT(loopLoc>fp->fPatIdx);
5007 
5008                 if (minCount == 0) {
5009                     if (maxCount != 0) {
5010                         fp = StateSave(fp, fp->fPatIdx, status);
5011                     }
5012                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5013                 }
5014             }
5015             break;
5016 
5017         case URX_CTR_LOOP_NG:
5018             {
5019                 // Non-greedy {min, max} loops
5020                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5021                 int32_t initOp = (int32_t)pat[opValue];
5022                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5023                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5024                 int32_t minCount  = (int32_t)pat[opValue+2];
5025                 int32_t maxCount  = (int32_t)pat[opValue+3];
5026                 // Increment the counter.  Note: we DIDN'T worry about counter
5027                 //   overflow, since the data comes from UnicodeStrings, which
5028                 //   stores its length in an int32_t. Do we have to think about
5029                 //   this now that we're using UText? Probably not, since the length
5030                 //   in UChar32s is still an int32_t.
5031                 (*pCounter)++;
5032                 U_ASSERT(*pCounter > 0);
5033 
5034                 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5035                     // The loop has matched the maximum permitted number of times.
5036                     //   Break out of here with no action.  Matching will
5037                     //   continue with the following pattern.
5038                     U_ASSERT(*pCounter == maxCount || maxCount == -1);
5039                     break;
5040                 }
5041 
5042                 if (*pCounter < minCount) {
5043                     // We haven't met the minimum number of matches yet.
5044                     //   Loop back for another one.
5045                     fp->fPatIdx = opValue + 4;    // Loop back.
5046                 } else {
5047                     // We do have the minimum number of matches.
5048                     //   Fall into the following pattern, but first do
5049                     //   a state save to the top of the loop, so that a failure
5050                     //   in the following pattern will try another iteration of the loop.
5051                     fp = StateSave(fp, opValue + 4, status);
5052                 }
5053             }
5054             break;
5055 
5056         case URX_STO_SP:
5057             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5058             fData[opValue] = fStack->size();
5059             break;
5060 
5061         case URX_LD_SP:
5062             {
5063                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5064                 int32_t newStackSize = (int32_t)fData[opValue];
5065                 U_ASSERT(newStackSize <= fStack->size());
5066                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5067                 if (newFP == (int64_t *)fp) {
5068                     break;
5069                 }
5070                 int32_t i;
5071                 for (i=0; i<fFrameSize; i++) {
5072                     newFP[i] = ((int64_t *)fp)[i];
5073                 }
5074                 fp = (REStackFrame *)newFP;
5075                 fStack->setSize(newStackSize);
5076             }
5077             break;
5078 
5079         case URX_BACKREF:
5080             {
5081                 U_ASSERT(opValue < fFrameSize);
5082                 int64_t groupStartIdx = fp->fExtra[opValue];
5083                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5084                 U_ASSERT(groupStartIdx <= groupEndIdx);
5085                 int64_t inputIndex = fp->fInputIdx;
5086                 if (groupStartIdx < 0) {
5087                     // This capture group has not participated in the match thus far,
5088                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5089                     break;
5090                 }
5091                 UBool success = TRUE;
5092                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5093                     if (inputIndex >= fActiveLimit) {
5094                         success = FALSE;
5095                         fHitEnd = TRUE;
5096                         break;
5097                     }
5098                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5099                         success = FALSE;
5100                         break;
5101                     }
5102                 }
5103                 if (success) {
5104                     fp->fInputIdx = inputIndex;
5105                 } else {
5106                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5107                 }
5108             }
5109             break;
5110 
5111         case URX_BACKREF_I:
5112             {
5113                 U_ASSERT(opValue < fFrameSize);
5114                 int64_t groupStartIdx = fp->fExtra[opValue];
5115                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5116                 U_ASSERT(groupStartIdx <= groupEndIdx);
5117                 if (groupStartIdx < 0) {
5118                     // This capture group has not participated in the match thus far,
5119                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5120                     break;
5121                 }
5122                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5123                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5124 
5125                 //   Note: if the capture group match was of an empty string the backref
5126                 //         match succeeds.  Verified by testing:  Perl matches succeed
5127                 //         in this case, so we do too.
5128 
5129                 UBool success = TRUE;
5130                 for (;;) {
5131                     UChar32 captureGroupChar = captureGroupItr.next();
5132                     if (captureGroupChar == U_SENTINEL) {
5133                         success = TRUE;
5134                         break;
5135                     }
5136                     UChar32 inputChar = inputItr.next();
5137                     if (inputChar == U_SENTINEL) {
5138                         success = FALSE;
5139                         fHitEnd = TRUE;
5140                         break;
5141                     }
5142                     if (inputChar != captureGroupChar) {
5143                         success = FALSE;
5144                         break;
5145                     }
5146                 }
5147 
5148                 if (success && inputItr.inExpansion()) {
5149                     // We obtained a match by consuming part of a string obtained from
5150                     // case-folding a single code point of the input text.
5151                     // This does not count as an overall match.
5152                     success = FALSE;
5153                 }
5154 
5155                 if (success) {
5156                     fp->fInputIdx = inputItr.getIndex();
5157                 } else {
5158                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5159                 }
5160             }
5161             break;
5162 
5163         case URX_STO_INP_LOC:
5164             {
5165                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5166                 fp->fExtra[opValue] = fp->fInputIdx;
5167             }
5168             break;
5169 
5170         case URX_JMPX:
5171             {
5172                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5173                 fp->fPatIdx += 1;
5174                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5175                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5176                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5177                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5178                 if (savedInputIdx < fp->fInputIdx) {
5179                     fp->fPatIdx = opValue;                               // JMP
5180                 } else {
5181                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5182                 }
5183             }
5184             break;
5185 
5186         case URX_LA_START:
5187             {
5188                 // Entering a lookahead block.
5189                 // Save Stack Ptr, Input Pos.
5190                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5191                 fData[opValue]   = fStack->size();
5192                 fData[opValue+1] = fp->fInputIdx;
5193                 fActiveStart     = fLookStart;          // Set the match region change for
5194                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5195             }
5196             break;
5197 
5198         case URX_LA_END:
5199             {
5200                 // Leaving a look-ahead block.
5201                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5202                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5203                 int32_t stackSize = fStack->size();
5204                 int32_t newStackSize = (int32_t)fData[opValue];
5205                 U_ASSERT(stackSize >= newStackSize);
5206                 if (stackSize > newStackSize) {
5207                     // Copy the current top frame back to the new (cut back) top frame.
5208                     //   This makes the capture groups from within the look-ahead
5209                     //   expression available.
5210                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5211                     int32_t i;
5212                     for (i=0; i<fFrameSize; i++) {
5213                         newFP[i] = ((int64_t *)fp)[i];
5214                     }
5215                     fp = (REStackFrame *)newFP;
5216                     fStack->setSize(newStackSize);
5217                 }
5218                 fp->fInputIdx = fData[opValue+1];
5219 
5220                 // Restore the active region bounds in the input string; they may have
5221                 //    been changed because of transparent bounds on a Region.
5222                 fActiveStart = fRegionStart;
5223                 fActiveLimit = fRegionLimit;
5224             }
5225             break;
5226 
5227         case URX_ONECHAR_I:
5228             if (fp->fInputIdx < fActiveLimit) {
5229                 UChar32 c;
5230                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5231                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5232                     break;
5233                 }
5234             } else {
5235                 fHitEnd = TRUE;
5236             }
5237             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5238             break;
5239 
5240         case URX_STRING_I:
5241             // Case-insensitive test input against a literal string.
5242             // Strings require two slots in the compiled pattern, one for the
5243             //   offset to the string text, and one for the length.
5244             //   The compiled string has already been case folded.
5245             {
5246                 const UChar *patternString = litText + opValue;
5247 
5248                 op      = (int32_t)pat[fp->fPatIdx];
5249                 fp->fPatIdx++;
5250                 opType  = URX_TYPE(op);
5251                 opValue = URX_VAL(op);
5252                 U_ASSERT(opType == URX_STRING_LEN);
5253                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5254 
5255                 UChar32      cText;
5256                 UChar32      cPattern;
5257                 UBool        success = TRUE;
5258                 int32_t      patternStringIdx  = 0;
5259                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5260                 while (patternStringIdx < patternStringLen) {
5261                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5262                     cText = inputIterator.next();
5263                     if (cText != cPattern) {
5264                         success = FALSE;
5265                         if (cText == U_SENTINEL) {
5266                             fHitEnd = TRUE;
5267                         }
5268                         break;
5269                     }
5270                 }
5271                 if (inputIterator.inExpansion()) {
5272                     success = FALSE;
5273                 }
5274 
5275                 if (success) {
5276                     fp->fInputIdx = inputIterator.getIndex();
5277                 } else {
5278                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5279                 }
5280             }
5281             break;
5282 
5283         case URX_LB_START:
5284             {
5285                 // Entering a look-behind block.
5286                 // Save Stack Ptr, Input Pos.
5287                 //   TODO:  implement transparent bounds.  Ticket #6067
5288                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5289                 fData[opValue]   = fStack->size();
5290                 fData[opValue+1] = fp->fInputIdx;
5291                 // Init the variable containing the start index for attempted matches.
5292                 fData[opValue+2] = -1;
5293                 // Save input string length, then reset to pin any matches to end at
5294                 //   the current position.
5295                 fData[opValue+3] = fActiveLimit;
5296                 fActiveLimit     = fp->fInputIdx;
5297             }
5298             break;
5299 
5300 
5301         case URX_LB_CONT:
5302             {
5303                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5304                 //    at all possible input starting positions.
5305 
5306                 // Fetch the min and max possible match lengths.  They are the operands
5307                 //   of this op in the pattern.
5308                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5309                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5310                 U_ASSERT(minML <= maxML);
5311                 U_ASSERT(minML >= 0);
5312 
5313                 // Fetch (from data) the last input index where a match was attempted.
5314                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5315                 int64_t  *lbStartIdx = &fData[opValue+2];
5316                 if (*lbStartIdx < 0) {
5317                     // First time through loop.
5318                     *lbStartIdx = fp->fInputIdx - minML;
5319                 } else {
5320                     // 2nd through nth time through the loop.
5321                     // Back up start position for match by one.
5322                     if (*lbStartIdx == 0) {
5323                         (*lbStartIdx)--;
5324                     } else {
5325                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5326                     }
5327                 }
5328 
5329                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5330                     // We have tried all potential match starting points without
5331                     //  getting a match.  Backtrack out, and out of the
5332                     //   Look Behind altogether.
5333                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5334                     int64_t restoreInputLen = fData[opValue+3];
5335                     U_ASSERT(restoreInputLen >= fActiveLimit);
5336                     U_ASSERT(restoreInputLen <= fInputLength);
5337                     fActiveLimit = restoreInputLen;
5338                     break;
5339                 }
5340 
5341                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5342                 //      (successful match will fall off the end of the loop.)
5343                 fp = StateSave(fp, fp->fPatIdx-3, status);
5344                 fp->fInputIdx =  *lbStartIdx;
5345             }
5346             break;
5347 
5348         case URX_LB_END:
5349             // End of a look-behind block, after a successful match.
5350             {
5351                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5352                 if (fp->fInputIdx != fActiveLimit) {
5353                     //  The look-behind expression matched, but the match did not
5354                     //    extend all the way to the point that we are looking behind from.
5355                     //  FAIL out of here, which will take us back to the LB_CONT, which
5356                     //     will retry the match starting at another position or fail
5357                     //     the look-behind altogether, whichever is appropriate.
5358                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5359                     break;
5360                 }
5361 
5362                 // Look-behind match is good.  Restore the original input string length,
5363                 //   which had been truncated to pin the end of the lookbehind match to the
5364                 //   position being looked-behind.
5365                 int64_t originalInputLen = fData[opValue+3];
5366                 U_ASSERT(originalInputLen >= fActiveLimit);
5367                 U_ASSERT(originalInputLen <= fInputLength);
5368                 fActiveLimit = originalInputLen;
5369             }
5370             break;
5371 
5372 
5373         case URX_LBN_CONT:
5374             {
5375                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5376                 //    at all possible input starting positions.
5377 
5378                 // Fetch the extra parameters of this op.
5379                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5380                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5381                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5382                 continueLoc = URX_VAL(continueLoc);
5383                 U_ASSERT(minML <= maxML);
5384                 U_ASSERT(minML >= 0);
5385                 U_ASSERT(continueLoc > fp->fPatIdx);
5386 
5387                 // Fetch (from data) the last input index where a match was attempted.
5388                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5389                 int64_t  *lbStartIdx = &fData[opValue+2];
5390                 if (*lbStartIdx < 0) {
5391                     // First time through loop.
5392                     *lbStartIdx = fp->fInputIdx - minML;
5393                 } else {
5394                     // 2nd through nth time through the loop.
5395                     // Back up start position for match by one.
5396                     if (*lbStartIdx == 0) {
5397                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5398                     } else {
5399                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5400                     }
5401                 }
5402 
5403                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5404                     // We have tried all potential match starting points without
5405                     //  getting a match, which means that the negative lookbehind as
5406                     //  a whole has succeeded.  Jump forward to the continue location
5407                     int64_t restoreInputLen = fData[opValue+3];
5408                     U_ASSERT(restoreInputLen >= fActiveLimit);
5409                     U_ASSERT(restoreInputLen <= fInputLength);
5410                     fActiveLimit = restoreInputLen;
5411                     fp->fPatIdx = continueLoc;
5412                     break;
5413                 }
5414 
5415                 //    Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5416                 //      (successful match will cause a FAIL out of the loop altogether.)
5417                 fp = StateSave(fp, fp->fPatIdx-4, status);
5418                 fp->fInputIdx =  *lbStartIdx;
5419             }
5420             break;
5421 
5422         case URX_LBN_END:
5423             // End of a negative look-behind block, after a successful match.
5424             {
5425                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5426                 if (fp->fInputIdx != fActiveLimit) {
5427                     //  The look-behind expression matched, but the match did not
5428                     //    extend all the way to the point that we are looking behind from.
5429                 //  FAIL out of here, which will take us back to the LBN_CONT, which
5430                     //     will retry the match starting at another position or succeed
5431                     //     the look-behind altogether, whichever is appropriate.
5432                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5433                     break;
5434                 }
5435 
5436                 // Look-behind expression matched, which means look-behind test as
5437                 //   a whole Fails
5438 
5439                 //   Restore the original input string length, which had been truncated
5440                 //   in order to pin the end of the lookbehind match
5441                 //   to the position being looked-behind.
5442                 int64_t originalInputLen = fData[opValue+3];
5443                 U_ASSERT(originalInputLen >= fActiveLimit);
5444                 U_ASSERT(originalInputLen <= fInputLength);
5445                 fActiveLimit = originalInputLen;
5446 
5447                 // Restore original stack position, discarding any state saved
5448                 //   by the successful pattern match.
5449                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5450                 int32_t newStackSize = (int32_t)fData[opValue];
5451                 U_ASSERT(fStack->size() > newStackSize);
5452                 fStack->setSize(newStackSize);
5453 
5454                 //  FAIL, which will take control back to someplace
5455                 //  prior to entering the look-behind test.
5456                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5457             }
5458             break;
5459 
5460 
5461         case URX_LOOP_SR_I:
5462             // Loop Initialization for the optimized implementation of
5463             //     [some character set]*
5464             //   This op scans through all matching input.
5465             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5466             {
5467                 U_ASSERT(opValue > 0 && opValue < sets->size());
5468                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5469                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5470 
5471                 // Loop through input, until either the input is exhausted or
5472                 //   we reach a character that is not a member of the set.
5473                 int32_t ix = (int32_t)fp->fInputIdx;
5474                 for (;;) {
5475                     if (ix >= fActiveLimit) {
5476                         fHitEnd = TRUE;
5477                         break;
5478                     }
5479                     UChar32   c;
5480                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5481                     if (c<256) {
5482                         if (s8->contains(c) == FALSE) {
5483                             U16_BACK_1(inputBuf, 0, ix);
5484                             break;
5485                         }
5486                     } else {
5487                         if (s->contains(c) == FALSE) {
5488                             U16_BACK_1(inputBuf, 0, ix);
5489                             break;
5490                         }
5491                     }
5492                 }
5493 
5494                 // If there were no matching characters, skip over the loop altogether.
5495                 //   The loop doesn't run at all, a * op always succeeds.
5496                 if (ix == fp->fInputIdx) {
5497                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5498                     break;
5499                 }
5500 
5501                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5502                 //   must follow.  Its operand is the stack location
5503                 //   that holds the starting input index for the match of this [set]*
5504                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5505                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5506                 int32_t stackLoc = URX_VAL(loopcOp);
5507                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5508                 fp->fExtra[stackLoc] = fp->fInputIdx;
5509                 fp->fInputIdx = ix;
5510 
5511                 // Save State to the URX_LOOP_C op that follows this one,
5512                 //   so that match failures in the following code will return to there.
5513                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5514                 fp = StateSave(fp, fp->fPatIdx, status);
5515                 fp->fPatIdx++;
5516             }
5517             break;
5518 
5519 
5520         case URX_LOOP_DOT_I:
5521             // Loop Initialization for the optimized implementation of .*
5522             //   This op scans through all remaining input.
5523             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5524             {
5525                 // Loop through input until the input is exhausted (we reach an end-of-line)
5526                 // In DOTALL mode, we can just go straight to the end of the input.
5527                 int32_t ix;
5528                 if ((opValue & 1) == 1) {
5529                     // Dot-matches-All mode.  Jump straight to the end of the string.
5530                     ix = (int32_t)fActiveLimit;
5531                     fHitEnd = TRUE;
5532                 } else {
5533                     // NOT DOT ALL mode.  Line endings do not match '.'
5534                     // Scan forward until a line ending or end of input.
5535                     ix = (int32_t)fp->fInputIdx;
5536                     for (;;) {
5537                         if (ix >= fActiveLimit) {
5538                             fHitEnd = TRUE;
5539                             break;
5540                         }
5541                         UChar32   c;
5542                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5543                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5544                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5545                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5546                                    ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5547                                 //  char is a line ending.  Put the input pos back to the
5548                                 //    line ending char, and exit the scanning loop.
5549                                 U16_BACK_1(inputBuf, 0, ix);
5550                                 break;
5551                             }
5552                         }
5553                     }
5554                 }
5555 
5556                 // If there were no matching characters, skip over the loop altogether.
5557                 //   The loop doesn't run at all, a * op always succeeds.
5558                 if (ix == fp->fInputIdx) {
5559                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5560                     break;
5561                 }
5562 
5563                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5564                 //   must follow.  Its operand is the stack location
5565                 //   that holds the starting input index for the match of this .*
5566                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5567                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5568                 int32_t stackLoc = URX_VAL(loopcOp);
5569                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5570                 fp->fExtra[stackLoc] = fp->fInputIdx;
5571                 fp->fInputIdx = ix;
5572 
5573                 // Save State to the URX_LOOP_C op that follows this one,
5574                 //   so that match failures in the following code will return to there.
5575                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5576                 fp = StateSave(fp, fp->fPatIdx, status);
5577                 fp->fPatIdx++;
5578             }
5579             break;
5580 
5581 
5582         case URX_LOOP_C:
5583             {
5584                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5585                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5586                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5587                 if (backSearchIndex == fp->fInputIdx) {
5588                     // We've backed up the input idx to the point that the loop started.
5589                     // The loop is done.  Leave here without saving state.
5590                     //  Subsequent failures won't come back here.
5591                     break;
5592                 }
5593                 // Set up for the next iteration of the loop, with input index
5594                 //   backed up by one from the last time through,
5595                 //   and a state save to this instruction in case the following code fails again.
5596                 //   (We're going backwards because this loop emulates stack unwinding, not
5597                 //    the initial scan forward.)
5598                 U_ASSERT(fp->fInputIdx > 0);
5599                 UChar32 prevC;
5600                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5601 
5602                 if (prevC == 0x0a &&
5603                     fp->fInputIdx > backSearchIndex &&
5604                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5605                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5606                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5607                         // .*, stepping back over CRLF pair.
5608                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5609                     }
5610                 }
5611 
5612 
5613                 fp = StateSave(fp, fp->fPatIdx-1, status);
5614             }
5615             break;
5616 
5617 
5618 
5619         default:
5620             // Trouble.  The compiled pattern contains an entry with an
5621             //           unrecognized type tag.
5622             U_ASSERT(FALSE);
5623         }
5624 
5625         if (U_FAILURE(status)) {
5626             isMatch = FALSE;
5627             break;
5628         }
5629     }
5630 
5631 breakFromLoop:
5632     fMatch = isMatch;
5633     if (isMatch) {
5634         fLastMatchEnd = fMatchEnd;
5635         fMatchStart   = startIdx;
5636         fMatchEnd     = fp->fInputIdx;
5637         if (fTraceDebug) {
5638             REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
5639         }
5640     }
5641     else
5642     {
5643         if (fTraceDebug) {
5644             REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
5645         }
5646     }
5647 
5648     fFrame = fp;                // The active stack frame when the engine stopped.
5649     //   Contains the capture group results that we need to
5650     //    access later.
5651 
5652     return;
5653 }
5654 
5655 
// NOTE(review): presumably expands to the class-ID (RTTI) definitions for
//   RegexMatcher; the macro is defined outside this file -- confirm there.
5656 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5657 
5658 U_NAMESPACE_END
5659 
5660 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5661