• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 *   Copyright (C) 2002-2016 International Business Machines Corporation
6 *   and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 //  file:  rematch.cpp
11 //
12 //         Contains the implementation of class RegexMatcher,
13 //         which is one of the main API classes for the ICU regular expression package.
14 //
15 
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36 
37 // #include <malloc.h>        // Needed for heapcheck testing
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 // Default limit for the size of the back track stack, to avoid system
43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 //    which use recursion rather than the heap, and take more storage per
46 //    backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49 
50 // Time limit counter constant.
51 //   Time limits for expression evaluation are in terms of quanta of work by
52 //   the engine, each of which is 10,000 state saves.
53 //   This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55 
56 
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60         return false;
61     }
62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64 
65 //-----------------------------------------------------------------------------
66 //
67 //   Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71     fDeferredStatus = U_ZERO_ERROR;
72     init(fDeferredStatus);
73     if (U_FAILURE(fDeferredStatus)) {
74         return;
75     }
76     if (pat==NULL) {
77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80     fPattern = pat;
81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83 
84 
85 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                            uint32_t flags, UErrorCode &status) {
88     init(status);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     UParseError    pe;
93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94     fPattern           = fPatternOwned;
95 
96     UText inputText = UTEXT_INITIALIZER;
97     utext_openConstUnicodeString(&inputText, &input, &status);
98     init2(&inputText, status);
99     utext_close(&inputText);
100 
101     fInputUniStrMaybeMutable = TRUE;
102 }
103 
104 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                            uint32_t flags, UErrorCode &status) {
107     init(status);
108     if (U_FAILURE(status)) {
109         return;
110     }
111     UParseError    pe;
112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113     if (U_FAILURE(status)) {
114         return;
115     }
116 
117     fPattern           = fPatternOwned;
118     init2(input, status);
119 }
120 
121 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                            uint32_t flags, UErrorCode &status) {
124     init(status);
125     if (U_FAILURE(status)) {
126         return;
127     }
128     UParseError    pe;
129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130     if (U_FAILURE(status)) {
131         return;
132     }
133     fPattern           = fPatternOwned;
134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138                            uint32_t flags, UErrorCode &status) {
139     init(status);
140     if (U_FAILURE(status)) {
141         return;
142     }
143     UParseError    pe;
144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145         if (U_FAILURE(status)) {
146         return;
147     }
148 
149     fPattern           = fPatternOwned;
150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152 
153 
154 
155 
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157     delete fStack;
158     if (fData != fSmallData) {
159         uprv_free(fData);
160         fData = NULL;
161     }
162     if (fPatternOwned) {
163         delete fPatternOwned;
164         fPatternOwned = NULL;
165         fPattern = NULL;
166     }
167 
168     if (fInput) {
169         delete fInput;
170     }
171     if (fInputText) {
172         utext_close(fInputText);
173     }
174     if (fAltInputText) {
175         utext_close(fAltInputText);
176     }
177 
178     #if UCONFIG_NO_BREAK_ITERATION==0
179     delete fWordBreakItr;
180     delete fGCBreakItr;
181     #endif
182 }
183 
184 //
185 //   init()   common initialization for use by all constructors.
186 //            Initialize all fields, get the object into a consistent state.
187 //            This must be done even when the initial status shows an error,
188 //            so that the object is initialized sufficiently well for the destructor
189 //            to run safely.
190 //
init(UErrorCode & status)191 void RegexMatcher::init(UErrorCode &status) {
192     fPattern           = NULL;
193     fPatternOwned      = NULL;
194     fFrameSize         = 0;
195     fRegionStart       = 0;
196     fRegionLimit       = 0;
197     fAnchorStart       = 0;
198     fAnchorLimit       = 0;
199     fLookStart         = 0;
200     fLookLimit         = 0;
201     fActiveStart       = 0;
202     fActiveLimit       = 0;
203     fTransparentBounds = FALSE;
204     fAnchoringBounds   = TRUE;
205     fMatch             = FALSE;
206     fMatchStart        = 0;
207     fMatchEnd          = 0;
208     fLastMatchEnd      = -1;
209     fAppendPosition    = 0;
210     fHitEnd            = FALSE;
211     fRequireEnd        = FALSE;
212     fStack             = NULL;
213     fFrame             = NULL;
214     fTimeLimit         = 0;
215     fTime              = 0;
216     fTickCounter       = 0;
217     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
218     fCallbackFn        = NULL;
219     fCallbackContext   = NULL;
220     fFindProgressCallbackFn      = NULL;
221     fFindProgressCallbackContext = NULL;
222     fTraceDebug        = FALSE;
223     fDeferredStatus    = status;
224     fData              = fSmallData;
225     fWordBreakItr      = NULL;
226     fGCBreakItr        = NULL;
227 
228     fStack             = NULL;
229     fInputText         = NULL;
230     fAltInputText      = NULL;
231     fInput             = NULL;
232     fInputLength       = 0;
233     fInputUniStrMaybeMutable = FALSE;
234 }
235 
236 //
237 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
238 //            This handles the common setup to be done after the Pattern is available.
239 //
init2(UText * input,UErrorCode & status)240 void RegexMatcher::init2(UText *input, UErrorCode &status) {
241     if (U_FAILURE(status)) {
242         fDeferredStatus = status;
243         return;
244     }
245 
246     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
247         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
248         if (fData == NULL) {
249             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
250             return;
251         }
252     }
253 
254     fStack = new UVector64(status);
255     if (fStack == NULL) {
256         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
257         return;
258     }
259 
260     reset(input);
261     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
262     if (U_FAILURE(status)) {
263         fDeferredStatus = status;
264         return;
265     }
266 }
267 
268 
269 static const UChar BACKSLASH  = 0x5c;
270 static const UChar DOLLARSIGN = 0x24;
271 static const UChar LEFTBRACKET = 0x7b;
272 static const UChar RIGHTBRACKET = 0x7d;
273 
274 //--------------------------------------------------------------------------------
275 //
276 //    appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                               const UnicodeString &replacement,
281                                               UErrorCode &status) {
282     UText replacementText = UTEXT_INITIALIZER;
283 
284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
285     if (U_SUCCESS(status)) {
286         UText resultText = UTEXT_INITIALIZER;
287         utext_openUnicodeString(&resultText, &dest, &status);
288 
289         if (U_SUCCESS(status)) {
290             appendReplacement(&resultText, &replacementText, status);
291             utext_close(&resultText);
292         }
293         utext_close(&replacementText);
294     }
295 
296     return *this;
297 }
298 
299 //
300 //    appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                               UText *replacement,
304                                               UErrorCode &status) {
305     if (U_FAILURE(status)) {
306         return *this;
307     }
308     if (U_FAILURE(fDeferredStatus)) {
309         status = fDeferredStatus;
310         return *this;
311     }
312     if (fMatch == FALSE) {
313         status = U_REGEX_INVALID_STATE;
314         return *this;
315     }
316 
317     // Copy input string from the end of previous match to start of current match
318     int64_t  destLen = utext_nativeLength(dest);
319     if (fMatchStart > fAppendPosition) {
320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
323         } else {
324             int32_t len16;
325             if (UTEXT_USES_U16(fInputText)) {
326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
327             } else {
328                 UErrorCode lengthStatus = U_ZERO_ERROR;
329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330             }
331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332             if (inputChars == NULL) {
333                 status = U_MEMORY_ALLOCATION_ERROR;
334                 return *this;
335             }
336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338             uprv_free(inputChars);
339         }
340     }
341     fAppendPosition = fMatchEnd;
342 
343 
344     // scan the replacement text, looking for substitutions ($n) and \escapes.
345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346     //         move entire ranges not containing substitutions.
347     UTEXT_SETNATIVEINDEX(replacement, 0);
348     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
349         if (c == BACKSLASH) {
350             // Backslash Escape.  Copy the following char out without further checks.
351             //                    Note:  Surrogate pairs don't need any special handling
352             //                           The second half wont be a '$' or a '\', and
353             //                           will move to the dest normally on the next
354             //                           loop iteration.
355             c = UTEXT_CURRENT32(replacement);
356             if (c == U_SENTINEL) {
357                 break;
358             }
359 
360             if (c==0x55/*U*/ || c==0x75/*u*/) {
361                 // We have a \udddd or \Udddddddd escape sequence.
362                 int32_t offset = 0;
363                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
364                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
365                 if (escapedChar != (UChar32)0xFFFFFFFF) {
366                     if (U_IS_BMP(escapedChar)) {
367                         UChar c16 = (UChar)escapedChar;
368                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369                     } else {
370                         UChar surrogate[2];
371                         surrogate[0] = U16_LEAD(escapedChar);
372                         surrogate[1] = U16_TRAIL(escapedChar);
373                         if (U_SUCCESS(status)) {
374                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375                         }
376                     }
377                     // TODO:  Report errors for mal-formed \u escapes?
378                     //        As this is, the original sequence is output, which may be OK.
379                     if (context.lastOffset == offset) {
380                         (void)UTEXT_PREVIOUS32(replacement);
381                     } else if (context.lastOffset != offset-1) {
382                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
383                     }
384                 }
385             } else {
386                 (void)UTEXT_NEXT32(replacement);
387                 // Plain backslash escape.  Just put out the escaped character.
388                 if (U_IS_BMP(c)) {
389                     UChar c16 = (UChar)c;
390                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391                 } else {
392                     UChar surrogate[2];
393                     surrogate[0] = U16_LEAD(c);
394                     surrogate[1] = U16_TRAIL(c);
395                     if (U_SUCCESS(status)) {
396                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
397                     }
398                 }
399             }
400         } else if (c != DOLLARSIGN) {
401             // Normal char, not a $.  Copy it out without further checks.
402             if (U_IS_BMP(c)) {
403                 UChar c16 = (UChar)c;
404                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405             } else {
406                 UChar surrogate[2];
407                 surrogate[0] = U16_LEAD(c);
408                 surrogate[1] = U16_TRAIL(c);
409                 if (U_SUCCESS(status)) {
410                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
411                 }
412             }
413         } else {
414             // We've got a $.  Pick up a capture group name or number if one follows.
415             // Consume digits so long as the resulting group number <= the number of
416             // number of capture groups in the pattern.
417 
418             int32_t groupNum  = 0;
419             int32_t numDigits = 0;
420             UChar32 nextChar = utext_current32(replacement);
421             if (nextChar == LEFTBRACKET) {
422                 // Scan for a Named Capture Group, ${name}.
423                 UnicodeString groupName;
424                 utext_next32(replacement);
425                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426                     nextChar = utext_next32(replacement);
427                     if (nextChar == U_SENTINEL) {
428                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
430                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
431                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
432                         groupName.append(nextChar);
433                     } else if (nextChar == RIGHTBRACKET) {
434                         groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
435                         if (groupNum == 0) {
436                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437                         }
438                     } else {
439                         // Character was something other than a name char or a closing '}'
440                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441                     }
442                 }
443 
444             } else if (u_isdigit(nextChar)) {
445                 // $n    Scan for a capture group number
446                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
447                 for (;;) {
448                     nextChar = UTEXT_CURRENT32(replacement);
449                     if (nextChar == U_SENTINEL) {
450                         break;
451                     }
452                     if (u_isdigit(nextChar) == FALSE) {
453                         break;
454                     }
455                     int32_t nextDigitVal = u_charDigitValue(nextChar);
456                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457                         // Don't consume the next digit if it makes the capture group number too big.
458                         if (numDigits == 0) {
459                             status = U_INDEX_OUTOFBOUNDS_ERROR;
460                         }
461                         break;
462                     }
463                     (void)UTEXT_NEXT32(replacement);
464                     groupNum=groupNum*10 + nextDigitVal;
465                     ++numDigits;
466                 }
467             } else {
468                 // $ not followed by capture group name or number.
469                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470             }
471 
472             if (U_SUCCESS(status)) {
473                 destLen += appendGroup(groupNum, dest, status);
474             }
475         }  // End of $ capture group handling
476     }  // End of per-character loop through the replacement string.
477 
478     return *this;
479 }
480 
481 
482 
483 //--------------------------------------------------------------------------------
484 //
485 //    appendTail     Intended to be used in conjunction with appendReplacement()
486 //                   To the destination string, append everything following
487 //                   the last match position from the input string.
488 //
489 //                   Note:  Match ranges do not affect appendTail or appendReplacement
490 //
491 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)492 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
493     UErrorCode status = U_ZERO_ERROR;
494     UText resultText = UTEXT_INITIALIZER;
495     utext_openUnicodeString(&resultText, &dest, &status);
496 
497     if (U_SUCCESS(status)) {
498         appendTail(&resultText, status);
499         utext_close(&resultText);
500     }
501 
502     return dest;
503 }
504 
505 //
506 //   appendTail, UText mode
507 //
appendTail(UText * dest,UErrorCode & status)508 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
509     if (U_FAILURE(status)) {
510         return dest;
511     }
512     if (U_FAILURE(fDeferredStatus)) {
513         status = fDeferredStatus;
514         return dest;
515     }
516 
517     if (fInputLength > fAppendPosition) {
518         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
519             int64_t destLen = utext_nativeLength(dest);
520             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
521                           (int32_t)(fInputLength-fAppendPosition), &status);
522         } else {
523             int32_t len16;
524             if (UTEXT_USES_U16(fInputText)) {
525                 len16 = (int32_t)(fInputLength-fAppendPosition);
526             } else {
527                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
528                 status = U_ZERO_ERROR; // buffer overflow
529             }
530 
531             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
532             if (inputChars == NULL) {
533                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
534             } else {
535                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
536                 int64_t destLen = utext_nativeLength(dest);
537                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
538                 uprv_free(inputChars);
539             }
540         }
541     }
542     return dest;
543 }
544 
545 
546 
547 //--------------------------------------------------------------------------------
548 //
549 //   end
550 //
551 //--------------------------------------------------------------------------------
end(UErrorCode & err) const552 int32_t RegexMatcher::end(UErrorCode &err) const {
553     return end(0, err);
554 }
555 
end64(UErrorCode & err) const556 int64_t RegexMatcher::end64(UErrorCode &err) const {
557     return end64(0, err);
558 }
559 
end64(int32_t group,UErrorCode & err) const560 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
561     if (U_FAILURE(err)) {
562         return -1;
563     }
564     if (fMatch == FALSE) {
565         err = U_REGEX_INVALID_STATE;
566         return -1;
567     }
568     if (group < 0 || group > fPattern->fGroupMap->size()) {
569         err = U_INDEX_OUTOFBOUNDS_ERROR;
570         return -1;
571     }
572     int64_t e = -1;
573     if (group == 0) {
574         e = fMatchEnd;
575     } else {
576         // Get the position within the stack frame of the variables for
577         //    this capture group.
578         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
579         U_ASSERT(groupOffset < fPattern->fFrameSize);
580         U_ASSERT(groupOffset >= 0);
581         e = fFrame->fExtra[groupOffset + 1];
582     }
583 
584         return e;
585 }
586 
end(int32_t group,UErrorCode & err) const587 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
588     return (int32_t)end64(group, err);
589 }
590 
591 //--------------------------------------------------------------------------------
592 //
593 //   findProgressInterrupt  This function is called once for each advance in the target
594 //                          string from the find() function, and calls the user progress callback
595 //                          function if there is one installed.
596 //
597 //         Return:  TRUE if the find operation is to be terminated.
598 //                  FALSE if the find operation is to continue running.
599 //
600 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)601 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
602     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
603         status = U_REGEX_STOPPED_BY_CALLER;
604         return TRUE;
605     }
606     return FALSE;
607 }
608 
609 //--------------------------------------------------------------------------------
610 //
611 //   find()
612 //
613 //--------------------------------------------------------------------------------
find()614 UBool RegexMatcher::find() {
615     if (U_FAILURE(fDeferredStatus)) {
616         return FALSE;
617     }
618     UErrorCode status = U_ZERO_ERROR;
619     UBool result = find(status);
620     return result;
621 }
622 
623 //--------------------------------------------------------------------------------
624 //
625 //   find()
626 //
627 //--------------------------------------------------------------------------------
find(UErrorCode & status)628 UBool RegexMatcher::find(UErrorCode &status) {
629     // Start at the position of the last match end.  (Will be zero if the
630     //   matcher has been reset.)
631     //
632     if (U_FAILURE(status)) {
633         return FALSE;
634     }
635     if (U_FAILURE(fDeferredStatus)) {
636         status = fDeferredStatus;
637         return FALSE;
638     }
639 
640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
641         return findUsingChunk(status);
642     }
643 
644     int64_t startPos = fMatchEnd;
645     if (startPos==0) {
646         startPos = fActiveStart;
647     }
648 
649     if (fMatch) {
650         // Save the position of any previous successful match.
651         fLastMatchEnd = fMatchEnd;
652 
653         if (fMatchStart == fMatchEnd) {
654             // Previous match had zero length.  Move start position up one position
655             //  to avoid sending find() into a loop on zero-length matches.
656             if (startPos >= fActiveLimit) {
657                 fMatch = FALSE;
658                 fHitEnd = TRUE;
659                 return FALSE;
660             }
661             UTEXT_SETNATIVEINDEX(fInputText, startPos);
662             (void)UTEXT_NEXT32(fInputText);
663             startPos = UTEXT_GETNATIVEINDEX(fInputText);
664         }
665     } else {
666         if (fLastMatchEnd >= 0) {
667             // A previous find() failed to match.  Don't try again.
668             //   (without this test, a pattern with a zero-length match
669             //    could match again at the end of an input string.)
670             fHitEnd = TRUE;
671             return FALSE;
672         }
673     }
674 
675 
676     // Compute the position in the input string beyond which a match can not begin, because
677     //   the minimum length match would extend past the end of the input.
678     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
679     //          Be aware of possible overflows if making changes here.
680     int64_t testStartLimit;
681     if (UTEXT_USES_U16(fInputText)) {
682         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683         if (startPos > testStartLimit) {
684             fMatch = FALSE;
685             fHitEnd = TRUE;
686             return FALSE;
687         }
688     } else {
689         // We don't know exactly how long the minimum match length is in native characters.
690         // Treat anything > 0 as 1.
691         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692     }
693 
694     UChar32  c;
695     U_ASSERT(startPos >= 0);
696 
697     switch (fPattern->fStartType) {
698     case START_NO_INFO:
699         // No optimization was found.
700         //  Try a match at each input position.
701         for (;;) {
702             MatchAt(startPos, FALSE, status);
703             if (U_FAILURE(status)) {
704                 return FALSE;
705             }
706             if (fMatch) {
707                 return TRUE;
708             }
709             if (startPos >= testStartLimit) {
710                 fHitEnd = TRUE;
711                 return FALSE;
712             }
713             UTEXT_SETNATIVEINDEX(fInputText, startPos);
714             (void)UTEXT_NEXT32(fInputText);
715             startPos = UTEXT_GETNATIVEINDEX(fInputText);
716             // Note that it's perfectly OK for a pattern to have a zero-length
717             //   match at the end of a string, so we must make sure that the loop
718             //   runs with startPos == testStartLimit the last time through.
719             if  (findProgressInterrupt(startPos, status))
720                 return FALSE;
721         }
722         UPRV_UNREACHABLE;
723 
724     case START_START:
725         // Matches are only possible at the start of the input string
726         //   (pattern begins with ^ or \A)
727         if (startPos > fActiveStart) {
728             fMatch = FALSE;
729             return FALSE;
730         }
731         MatchAt(startPos, FALSE, status);
732         if (U_FAILURE(status)) {
733             return FALSE;
734         }
735         return fMatch;
736 
737 
738     case START_SET:
739         {
740             // Match may start on any char from a pre-computed set.
741             U_ASSERT(fPattern->fMinMatchLen > 0);
742             UTEXT_SETNATIVEINDEX(fInputText, startPos);
743             for (;;) {
744                 int64_t pos = startPos;
745                 c = UTEXT_NEXT32(fInputText);
746                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
747                 // c will be -1 (U_SENTINEL) at end of text, in which case we
748                 // skip this next block (so we don't have a negative array index)
749                 // and handle end of text in the following block.
750                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
752                     MatchAt(pos, FALSE, status);
753                     if (U_FAILURE(status)) {
754                         return FALSE;
755                     }
756                     if (fMatch) {
757                         return TRUE;
758                     }
759                     UTEXT_SETNATIVEINDEX(fInputText, pos);
760                 }
761                 if (startPos > testStartLimit) {
762                     fMatch = FALSE;
763                     fHitEnd = TRUE;
764                     return FALSE;
765                 }
766                 if  (findProgressInterrupt(startPos, status))
767                     return FALSE;
768             }
769         }
770         UPRV_UNREACHABLE;
771 
772     case START_STRING:
773     case START_CHAR:
774         {
775             // Match starts on exactly one char.
776             U_ASSERT(fPattern->fMinMatchLen > 0);
777             UChar32 theChar = fPattern->fInitialChar;
778             UTEXT_SETNATIVEINDEX(fInputText, startPos);
779             for (;;) {
780                 int64_t pos = startPos;
781                 c = UTEXT_NEXT32(fInputText);
782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783                 if (c == theChar) {
784                     MatchAt(pos, FALSE, status);
785                     if (U_FAILURE(status)) {
786                         return FALSE;
787                     }
788                     if (fMatch) {
789                         return TRUE;
790                     }
791                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
792                 }
793                 if (startPos > testStartLimit) {
794                     fMatch = FALSE;
795                     fHitEnd = TRUE;
796                     return FALSE;
797                 }
798                 if  (findProgressInterrupt(startPos, status))
799                     return FALSE;
800            }
801         }
802         UPRV_UNREACHABLE;
803 
804     case START_LINE:
805         {
806             UChar32 ch;
807             if (startPos == fAnchorStart) {
808                 MatchAt(startPos, FALSE, status);
809                 if (U_FAILURE(status)) {
810                     return FALSE;
811                 }
812                 if (fMatch) {
813                     return TRUE;
814                 }
815                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816                 ch = UTEXT_NEXT32(fInputText);
817                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
818             } else {
819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820                 ch = UTEXT_PREVIOUS32(fInputText);
821                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
822             }
823 
824             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825                 for (;;) {
826                     if (ch == 0x0a) {
827                             MatchAt(startPos, FALSE, status);
828                             if (U_FAILURE(status)) {
829                                 return FALSE;
830                             }
831                             if (fMatch) {
832                                 return TRUE;
833                             }
834                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
835                     }
836                     if (startPos >= testStartLimit) {
837                         fMatch = FALSE;
838                         fHitEnd = TRUE;
839                         return FALSE;
840                     }
841                     ch = UTEXT_NEXT32(fInputText);
842                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
843                     // Note that it's perfectly OK for a pattern to have a zero-length
844                     //   match at the end of a string, so we must make sure that the loop
845                     //   runs with startPos == testStartLimit the last time through.
846                     if  (findProgressInterrupt(startPos, status))
847                         return FALSE;
848                 }
849             } else {
850                 for (;;) {
851                     if (isLineTerminator(ch)) {
852                         if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
853                             (void)UTEXT_NEXT32(fInputText);
854                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
855                         }
856                         MatchAt(startPos, FALSE, status);
857                         if (U_FAILURE(status)) {
858                             return FALSE;
859                         }
860                         if (fMatch) {
861                             return TRUE;
862                         }
863                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
864                     }
865                     if (startPos >= testStartLimit) {
866                         fMatch = FALSE;
867                         fHitEnd = TRUE;
868                         return FALSE;
869                     }
870                     ch = UTEXT_NEXT32(fInputText);
871                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
872                     // Note that it's perfectly OK for a pattern to have a zero-length
873                     //   match at the end of a string, so we must make sure that the loop
874                     //   runs with startPos == testStartLimit the last time through.
875                     if  (findProgressInterrupt(startPos, status))
876                         return FALSE;
877                 }
878             }
879         }
880 
881     default:
882         UPRV_UNREACHABLE;
883     }
884 
885     UPRV_UNREACHABLE;
886 }
887 
888 
889 
find(int64_t start,UErrorCode & status)890 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
891     if (U_FAILURE(status)) {
892         return FALSE;
893     }
894     if (U_FAILURE(fDeferredStatus)) {
895         status = fDeferredStatus;
896         return FALSE;
897     }
898     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
899                                           //        This will reset the region to be the full input length.
900     if (start < 0) {
901         status = U_INDEX_OUTOFBOUNDS_ERROR;
902         return FALSE;
903     }
904 
905     int64_t nativeStart = start;
906     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
907         status = U_INDEX_OUTOFBOUNDS_ERROR;
908         return FALSE;
909     }
910     fMatchEnd = nativeStart;
911     return find(status);
912 }
913 
914 
915 //--------------------------------------------------------------------------------
916 //
917 //   findUsingChunk() -- like find(), but with the advance knowledge that the
918 //                       entire string is available in the UText's chunk buffer.
919 //
920 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)921 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
922     // Start at the position of the last match end.  (Will be zero if the
923     //   matcher has been reset.
924     //
925 
926     int32_t startPos = (int32_t)fMatchEnd;
927     if (startPos==0) {
928         startPos = (int32_t)fActiveStart;
929     }
930 
931     const UChar *inputBuf = fInputText->chunkContents;
932 
933     if (fMatch) {
934         // Save the position of any previous successful match.
935         fLastMatchEnd = fMatchEnd;
936 
937         if (fMatchStart == fMatchEnd) {
938             // Previous match had zero length.  Move start position up one position
939             //  to avoid sending find() into a loop on zero-length matches.
940             if (startPos >= fActiveLimit) {
941                 fMatch = FALSE;
942                 fHitEnd = TRUE;
943                 return FALSE;
944             }
945             U16_FWD_1(inputBuf, startPos, fInputLength);
946         }
947     } else {
948         if (fLastMatchEnd >= 0) {
949             // A previous find() failed to match.  Don't try again.
950             //   (without this test, a pattern with a zero-length match
951             //    could match again at the end of an input string.)
952             fHitEnd = TRUE;
953             return FALSE;
954         }
955     }
956 
957 
958     // Compute the position in the input string beyond which a match can not begin, because
959     //   the minimum length match would extend past the end of the input.
960     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
961     //          Be aware of possible overflows if making changes here.
962     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
963     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
964     if (startPos > testLen) {
965         fMatch = FALSE;
966         fHitEnd = TRUE;
967         return FALSE;
968     }
969 
970     UChar32  c;
971     U_ASSERT(startPos >= 0);
972 
973     switch (fPattern->fStartType) {
974     case START_NO_INFO:
975         // No optimization was found.
976         //  Try a match at each input position.
977         for (;;) {
978             MatchChunkAt(startPos, FALSE, status);
979             if (U_FAILURE(status)) {
980                 return FALSE;
981             }
982             if (fMatch) {
983                 return TRUE;
984             }
985             if (startPos >= testLen) {
986                 fHitEnd = TRUE;
987                 return FALSE;
988             }
989             U16_FWD_1(inputBuf, startPos, fActiveLimit);
990             // Note that it's perfectly OK for a pattern to have a zero-length
991             //   match at the end of a string, so we must make sure that the loop
992             //   runs with startPos == testLen the last time through.
993             if  (findProgressInterrupt(startPos, status))
994                 return FALSE;
995         }
996         UPRV_UNREACHABLE;
997 
998     case START_START:
999         // Matches are only possible at the start of the input string
1000         //   (pattern begins with ^ or \A)
1001         if (startPos > fActiveStart) {
1002             fMatch = FALSE;
1003             return FALSE;
1004         }
1005         MatchChunkAt(startPos, FALSE, status);
1006         if (U_FAILURE(status)) {
1007             return FALSE;
1008         }
1009         return fMatch;
1010 
1011 
1012     case START_SET:
1013     {
1014         // Match may start on any char from a pre-computed set.
1015         U_ASSERT(fPattern->fMinMatchLen > 0);
1016         for (;;) {
1017             int32_t pos = startPos;
1018             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1019             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1020                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1021                 MatchChunkAt(pos, FALSE, status);
1022                 if (U_FAILURE(status)) {
1023                     return FALSE;
1024                 }
1025                 if (fMatch) {
1026                     return TRUE;
1027                 }
1028             }
1029             if (startPos > testLen) {
1030                 fMatch = FALSE;
1031                 fHitEnd = TRUE;
1032                 return FALSE;
1033             }
1034             if  (findProgressInterrupt(startPos, status))
1035                 return FALSE;
1036         }
1037     }
1038     UPRV_UNREACHABLE;
1039 
1040     case START_STRING:
1041     case START_CHAR:
1042     {
1043         // Match starts on exactly one char.
1044         U_ASSERT(fPattern->fMinMatchLen > 0);
1045         UChar32 theChar = fPattern->fInitialChar;
1046         for (;;) {
1047             int32_t pos = startPos;
1048             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1049             if (c == theChar) {
1050                 MatchChunkAt(pos, FALSE, status);
1051                 if (U_FAILURE(status)) {
1052                     return FALSE;
1053                 }
1054                 if (fMatch) {
1055                     return TRUE;
1056                 }
1057             }
1058             if (startPos > testLen) {
1059                 fMatch = FALSE;
1060                 fHitEnd = TRUE;
1061                 return FALSE;
1062             }
1063             if  (findProgressInterrupt(startPos, status))
1064                 return FALSE;
1065         }
1066     }
1067     UPRV_UNREACHABLE;
1068 
1069     case START_LINE:
1070     {
1071         UChar32 ch;
1072         if (startPos == fAnchorStart) {
1073             MatchChunkAt(startPos, FALSE, status);
1074             if (U_FAILURE(status)) {
1075                 return FALSE;
1076             }
1077             if (fMatch) {
1078                 return TRUE;
1079             }
1080             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1081         }
1082 
1083         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1084             for (;;) {
1085                 ch = inputBuf[startPos-1];
1086                 if (ch == 0x0a) {
1087                     MatchChunkAt(startPos, FALSE, status);
1088                     if (U_FAILURE(status)) {
1089                         return FALSE;
1090                     }
1091                     if (fMatch) {
1092                         return TRUE;
1093                     }
1094                 }
1095                 if (startPos >= testLen) {
1096                     fMatch = FALSE;
1097                     fHitEnd = TRUE;
1098                     return FALSE;
1099                 }
1100                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1101                 // Note that it's perfectly OK for a pattern to have a zero-length
1102                 //   match at the end of a string, so we must make sure that the loop
1103                 //   runs with startPos == testLen the last time through.
1104                 if  (findProgressInterrupt(startPos, status))
1105                     return FALSE;
1106             }
1107         } else {
1108             for (;;) {
1109                 ch = inputBuf[startPos-1];
1110                 if (isLineTerminator(ch)) {
1111                     if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1112                         startPos++;
1113                     }
1114                     MatchChunkAt(startPos, FALSE, status);
1115                     if (U_FAILURE(status)) {
1116                         return FALSE;
1117                     }
1118                     if (fMatch) {
1119                         return TRUE;
1120                     }
1121                 }
1122                 if (startPos >= testLen) {
1123                     fMatch = FALSE;
1124                     fHitEnd = TRUE;
1125                     return FALSE;
1126                 }
1127                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1128                 // Note that it's perfectly OK for a pattern to have a zero-length
1129                 //   match at the end of a string, so we must make sure that the loop
1130                 //   runs with startPos == testLen the last time through.
1131                 if  (findProgressInterrupt(startPos, status))
1132                     return FALSE;
1133             }
1134         }
1135     }
1136 
1137     default:
1138         UPRV_UNREACHABLE;
1139     }
1140 
1141     UPRV_UNREACHABLE;
1142 }
1143 
1144 
1145 
1146 //--------------------------------------------------------------------------------
1147 //
1148 //  group()
1149 //
1150 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152     return group(0, status);
1153 }
1154 
1155 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157     return group(0, dest, group_len, status);
1158 }
1159 
1160 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162     group_len = 0;
1163     if (U_FAILURE(status)) {
1164         return dest;
1165     }
1166     if (U_FAILURE(fDeferredStatus)) {
1167         status = fDeferredStatus;
1168     } else if (fMatch == FALSE) {
1169         status = U_REGEX_INVALID_STATE;
1170     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171         status = U_INDEX_OUTOFBOUNDS_ERROR;
1172     }
1173 
1174     if (U_FAILURE(status)) {
1175         return dest;
1176     }
1177 
1178     int64_t s, e;
1179     if (groupNum == 0) {
1180         s = fMatchStart;
1181         e = fMatchEnd;
1182     } else {
1183         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184         U_ASSERT(groupOffset < fPattern->fFrameSize);
1185         U_ASSERT(groupOffset >= 0);
1186         s = fFrame->fExtra[groupOffset];
1187         e = fFrame->fExtra[groupOffset+1];
1188     }
1189 
1190     if (s < 0) {
1191         // A capture group wasn't part of the match
1192         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193     }
1194     U_ASSERT(s <= e);
1195     group_len = e - s;
1196 
1197     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198     if (dest)
1199         UTEXT_SETNATIVEINDEX(dest, s);
1200     return dest;
1201 }
1202 
group(int32_t groupNum,UErrorCode & status) const1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204     UnicodeString result;
1205     int64_t groupStart = start64(groupNum, status);
1206     int64_t groupEnd = end64(groupNum, status);
1207     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208         return result;
1209     }
1210 
1211     // Get the group length using a utext_extract preflight.
1212     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214     if (status != U_BUFFER_OVERFLOW_ERROR) {
1215         return result;
1216     }
1217 
1218     status = U_ZERO_ERROR;
1219     UChar *buf = result.getBuffer(length);
1220     if (buf == NULL) {
1221         status = U_MEMORY_ALLOCATION_ERROR;
1222     } else {
1223         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224         result.releaseBuffer(extractLength);
1225         U_ASSERT(length == extractLength);
1226     }
1227     return result;
1228 }
1229 
1230 
1231 //--------------------------------------------------------------------------------
1232 //
1233 //  appendGroup() -- currently internal only, appends a group to a UText rather
1234 //                   than replacing its contents
1235 //
1236 //--------------------------------------------------------------------------------
1237 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239     if (U_FAILURE(status)) {
1240         return 0;
1241     }
1242     if (U_FAILURE(fDeferredStatus)) {
1243         status = fDeferredStatus;
1244         return 0;
1245     }
1246     int64_t destLen = utext_nativeLength(dest);
1247 
1248     if (fMatch == FALSE) {
1249         status = U_REGEX_INVALID_STATE;
1250         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251     }
1252     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253         status = U_INDEX_OUTOFBOUNDS_ERROR;
1254         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255     }
1256 
1257     int64_t s, e;
1258     if (groupNum == 0) {
1259         s = fMatchStart;
1260         e = fMatchEnd;
1261     } else {
1262         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263         U_ASSERT(groupOffset < fPattern->fFrameSize);
1264         U_ASSERT(groupOffset >= 0);
1265         s = fFrame->fExtra[groupOffset];
1266         e = fFrame->fExtra[groupOffset+1];
1267     }
1268 
1269     if (s < 0) {
1270         // A capture group wasn't part of the match
1271         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272     }
1273     U_ASSERT(s <= e);
1274 
1275     int64_t deltaLen;
1276     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277         U_ASSERT(e <= fInputLength);
1278         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279     } else {
1280         int32_t len16;
1281         if (UTEXT_USES_U16(fInputText)) {
1282             len16 = (int32_t)(e-s);
1283         } else {
1284             UErrorCode lengthStatus = U_ZERO_ERROR;
1285             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286         }
1287         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288         if (groupChars == NULL) {
1289             status = U_MEMORY_ALLOCATION_ERROR;
1290             return 0;
1291         }
1292         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293 
1294         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295         uprv_free(groupChars);
1296     }
1297     return deltaLen;
1298 }
1299 
1300 
1301 
1302 //--------------------------------------------------------------------------------
1303 //
1304 //  groupCount()
1305 //
1306 //--------------------------------------------------------------------------------
groupCount() const1307 int32_t RegexMatcher::groupCount() const {
1308     return fPattern->fGroupMap->size();
1309 }
1310 
1311 //--------------------------------------------------------------------------------
1312 //
1313 //  hasAnchoringBounds()
1314 //
1315 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317     return fAnchoringBounds;
1318 }
1319 
1320 
1321 //--------------------------------------------------------------------------------
1322 //
1323 //  hasTransparentBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasTransparentBounds() const1326 UBool RegexMatcher::hasTransparentBounds() const {
1327     return fTransparentBounds;
1328 }
1329 
1330 
1331 
1332 //--------------------------------------------------------------------------------
1333 //
1334 //  hitEnd()
1335 //
1336 //--------------------------------------------------------------------------------
hitEnd() const1337 UBool RegexMatcher::hitEnd() const {
1338     return fHitEnd;
1339 }
1340 
1341 
1342 //--------------------------------------------------------------------------------
1343 //
1344 //  input()
1345 //
1346 //--------------------------------------------------------------------------------
input() const1347 const UnicodeString &RegexMatcher::input() const {
1348     if (!fInput) {
1349         UErrorCode status = U_ZERO_ERROR;
1350         int32_t len16;
1351         if (UTEXT_USES_U16(fInputText)) {
1352             len16 = (int32_t)fInputLength;
1353         } else {
1354             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355             status = U_ZERO_ERROR; // overflow, length status
1356         }
1357         UnicodeString *result = new UnicodeString(len16, 0, 0);
1358 
1359         UChar *inputChars = result->getBuffer(len16);
1360         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361         result->releaseBuffer(len16);
1362 
1363         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364     }
1365 
1366     return *fInput;
1367 }
1368 
1369 //--------------------------------------------------------------------------------
1370 //
1371 //  inputText()
1372 //
1373 //--------------------------------------------------------------------------------
inputText() const1374 UText *RegexMatcher::inputText() const {
1375     return fInputText;
1376 }
1377 
1378 
1379 //--------------------------------------------------------------------------------
1380 //
1381 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1382 //
1383 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385     if (U_FAILURE(status)) {
1386         return dest;
1387     }
1388     if (U_FAILURE(fDeferredStatus)) {
1389         status = fDeferredStatus;
1390         return dest;
1391     }
1392 
1393     if (dest) {
1394         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396         } else {
1397             int32_t input16Len;
1398             if (UTEXT_USES_U16(fInputText)) {
1399                 input16Len = (int32_t)fInputLength;
1400             } else {
1401                 UErrorCode lengthStatus = U_ZERO_ERROR;
1402                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403             }
1404             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405             if (inputChars == NULL) {
1406                 return dest;
1407             }
1408 
1409             status = U_ZERO_ERROR;
1410             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411             status = U_ZERO_ERROR;
1412             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413 
1414             uprv_free(inputChars);
1415         }
1416         return dest;
1417     } else {
1418         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419     }
1420 }
1421 
1422 
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425     UBool retVal = FALSE;
1426 
1427     //  In the following test, we're really only interested in whether the UText should switch
1428     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1429     //  will still point to the correct data.
1430     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431         UnicodeString *us=(UnicodeString *)ut->context;
1432 
1433         // Update to the latest length.
1434         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435         int32_t newLength = us->length();
1436 
1437         // Update the chunk description.
1438         // The buffer may have switched between stack- and heap-based.
1439         ut->chunkContents    = us->getBuffer();
1440         ut->chunkLength      = newLength;
1441         ut->chunkNativeLimit = newLength;
1442         ut->nativeIndexingLimit = newLength;
1443         retVal = TRUE;
1444     }
1445 
1446     return retVal;
1447 }
1448 
1449 //--------------------------------------------------------------------------------
1450 //
1451 //  lookingAt()
1452 //
1453 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455     if (U_FAILURE(status)) {
1456         return FALSE;
1457     }
1458     if (U_FAILURE(fDeferredStatus)) {
1459         status = fDeferredStatus;
1460         return FALSE;
1461     }
1462 
1463     if (fInputUniStrMaybeMutable) {
1464         if (compat_SyncMutableUTextContents(fInputText)) {
1465         fInputLength = utext_nativeLength(fInputText);
1466         reset();
1467         }
1468     }
1469     else {
1470         resetPreserveRegion();
1471     }
1472     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474     } else {
1475         MatchAt(fActiveStart, FALSE, status);
1476     }
1477     return fMatch;
1478 }
1479 
1480 
lookingAt(int64_t start,UErrorCode & status)1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482     if (U_FAILURE(status)) {
1483         return FALSE;
1484     }
1485     if (U_FAILURE(fDeferredStatus)) {
1486         status = fDeferredStatus;
1487         return FALSE;
1488     }
1489     reset();
1490 
1491     if (start < 0) {
1492         status = U_INDEX_OUTOFBOUNDS_ERROR;
1493         return FALSE;
1494     }
1495 
1496     if (fInputUniStrMaybeMutable) {
1497         if (compat_SyncMutableUTextContents(fInputText)) {
1498         fInputLength = utext_nativeLength(fInputText);
1499         reset();
1500         }
1501     }
1502 
1503     int64_t nativeStart;
1504     nativeStart = start;
1505     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506         status = U_INDEX_OUTOFBOUNDS_ERROR;
1507         return FALSE;
1508     }
1509 
1510     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512     } else {
1513         MatchAt(nativeStart, FALSE, status);
1514     }
1515     return fMatch;
1516 }
1517 
1518 
1519 
1520 //--------------------------------------------------------------------------------
1521 //
1522 //  matches()
1523 //
1524 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526     if (U_FAILURE(status)) {
1527         return FALSE;
1528     }
1529     if (U_FAILURE(fDeferredStatus)) {
1530         status = fDeferredStatus;
1531         return FALSE;
1532     }
1533 
1534     if (fInputUniStrMaybeMutable) {
1535         if (compat_SyncMutableUTextContents(fInputText)) {
1536         fInputLength = utext_nativeLength(fInputText);
1537         reset();
1538         }
1539     }
1540     else {
1541         resetPreserveRegion();
1542     }
1543 
1544     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546     } else {
1547         MatchAt(fActiveStart, TRUE, status);
1548     }
1549     return fMatch;
1550 }
1551 
1552 
matches(int64_t start,UErrorCode & status)1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554     if (U_FAILURE(status)) {
1555         return FALSE;
1556     }
1557     if (U_FAILURE(fDeferredStatus)) {
1558         status = fDeferredStatus;
1559         return FALSE;
1560     }
1561     reset();
1562 
1563     if (start < 0) {
1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
1565         return FALSE;
1566     }
1567 
1568     if (fInputUniStrMaybeMutable) {
1569         if (compat_SyncMutableUTextContents(fInputText)) {
1570         fInputLength = utext_nativeLength(fInputText);
1571         reset();
1572         }
1573     }
1574 
1575     int64_t nativeStart;
1576     nativeStart = start;
1577     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578         status = U_INDEX_OUTOFBOUNDS_ERROR;
1579         return FALSE;
1580     }
1581 
1582     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584     } else {
1585         MatchAt(nativeStart, TRUE, status);
1586     }
1587     return fMatch;
1588 }
1589 
1590 
1591 
1592 //--------------------------------------------------------------------------------
1593 //
1594 //    pattern
1595 //
1596 //--------------------------------------------------------------------------------
pattern() const1597 const RegexPattern &RegexMatcher::pattern() const {
1598     return *fPattern;
1599 }
1600 
1601 
1602 
1603 //--------------------------------------------------------------------------------
1604 //
1605 //    region
1606 //
1607 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609     if (U_FAILURE(status)) {
1610         return *this;
1611     }
1612 
1613     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614         status = U_ILLEGAL_ARGUMENT_ERROR;
1615     }
1616 
1617     int64_t nativeStart = regionStart;
1618     int64_t nativeLimit = regionLimit;
1619     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620       status = U_ILLEGAL_ARGUMENT_ERROR;
1621     }
1622 
1623     if (startIndex == -1)
1624       this->reset();
1625     else
1626       resetPreserveRegion();
1627 
1628     fRegionStart = nativeStart;
1629     fRegionLimit = nativeLimit;
1630     fActiveStart = nativeStart;
1631     fActiveLimit = nativeLimit;
1632 
1633     if (startIndex != -1) {
1634       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635           status = U_INDEX_OUTOFBOUNDS_ERROR;
1636       }
1637       fMatchEnd = startIndex;
1638     }
1639 
1640     if (!fTransparentBounds) {
1641         fLookStart = nativeStart;
1642         fLookLimit = nativeLimit;
1643     }
1644     if (fAnchoringBounds) {
1645         fAnchorStart = nativeStart;
1646         fAnchorLimit = nativeLimit;
1647     }
1648     return *this;
1649 }
1650 
region(int64_t start,int64_t limit,UErrorCode & status)1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652   return region(start, limit, -1, status);
1653 }
1654 
1655 //--------------------------------------------------------------------------------
1656 //
1657 //    regionEnd
1658 //
1659 //--------------------------------------------------------------------------------
regionEnd() const1660 int32_t RegexMatcher::regionEnd() const {
1661     return (int32_t)fRegionLimit;
1662 }
1663 
regionEnd64() const1664 int64_t RegexMatcher::regionEnd64() const {
1665     return fRegionLimit;
1666 }
1667 
1668 //--------------------------------------------------------------------------------
1669 //
1670 //    regionStart
1671 //
1672 //--------------------------------------------------------------------------------
regionStart() const1673 int32_t RegexMatcher::regionStart() const {
1674     return (int32_t)fRegionStart;
1675 }
1676 
regionStart64() const1677 int64_t RegexMatcher::regionStart64() const {
1678     return fRegionStart;
1679 }
1680 
1681 
1682 //--------------------------------------------------------------------------------
1683 //
1684 //    replaceAll
1685 //
1686 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688     UText replacementText = UTEXT_INITIALIZER;
1689     UText resultText = UTEXT_INITIALIZER;
1690     UnicodeString resultString;
1691     if (U_FAILURE(status)) {
1692         return resultString;
1693     }
1694 
1695     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696     utext_openUnicodeString(&resultText, &resultString, &status);
1697 
1698     replaceAll(&replacementText, &resultText, status);
1699 
1700     utext_close(&resultText);
1701     utext_close(&replacementText);
1702 
1703     return resultString;
1704 }
1705 
1706 
1707 //
1708 //    replaceAll, UText mode
1709 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711     if (U_FAILURE(status)) {
1712         return dest;
1713     }
1714     if (U_FAILURE(fDeferredStatus)) {
1715         status = fDeferredStatus;
1716         return dest;
1717     }
1718 
1719     if (dest == NULL) {
1720         UnicodeString emptyString;
1721         UText empty = UTEXT_INITIALIZER;
1722 
1723         utext_openUnicodeString(&empty, &emptyString, &status);
1724         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725         utext_close(&empty);
1726     }
1727 
1728     if (U_SUCCESS(status)) {
1729         reset();
1730         while (find()) {
1731             appendReplacement(dest, replacement, status);
1732             if (U_FAILURE(status)) {
1733                 break;
1734             }
1735         }
1736         appendTail(dest, status);
1737     }
1738 
1739     return dest;
1740 }
1741 
1742 
1743 //--------------------------------------------------------------------------------
1744 //
1745 //    replaceFirst
1746 //
1747 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749     UText replacementText = UTEXT_INITIALIZER;
1750     UText resultText = UTEXT_INITIALIZER;
1751     UnicodeString resultString;
1752 
1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754     utext_openUnicodeString(&resultText, &resultString, &status);
1755 
1756     replaceFirst(&replacementText, &resultText, status);
1757 
1758     utext_close(&resultText);
1759     utext_close(&replacementText);
1760 
1761     return resultString;
1762 }
1763 
1764 //
1765 //    replaceFirst, UText mode
1766 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768     if (U_FAILURE(status)) {
1769         return dest;
1770     }
1771     if (U_FAILURE(fDeferredStatus)) {
1772         status = fDeferredStatus;
1773         return dest;
1774     }
1775 
1776     reset();
1777     if (!find()) {
1778         return getInput(dest, status);
1779     }
1780 
1781     if (dest == NULL) {
1782         UnicodeString emptyString;
1783         UText empty = UTEXT_INITIALIZER;
1784 
1785         utext_openUnicodeString(&empty, &emptyString, &status);
1786         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787         utext_close(&empty);
1788     }
1789 
1790     appendReplacement(dest, replacement, status);
1791     appendTail(dest, status);
1792 
1793     return dest;
1794 }
1795 
1796 
1797 //--------------------------------------------------------------------------------
1798 //
1799 //     requireEnd
1800 //
1801 //--------------------------------------------------------------------------------
requireEnd() const1802 UBool RegexMatcher::requireEnd() const {
1803     return fRequireEnd;
1804 }
1805 
1806 
1807 //--------------------------------------------------------------------------------
1808 //
1809 //     reset
1810 //
1811 //--------------------------------------------------------------------------------
reset()1812 RegexMatcher &RegexMatcher::reset() {
1813     fRegionStart    = 0;
1814     fRegionLimit    = fInputLength;
1815     fActiveStart    = 0;
1816     fActiveLimit    = fInputLength;
1817     fAnchorStart    = 0;
1818     fAnchorLimit    = fInputLength;
1819     fLookStart      = 0;
1820     fLookLimit      = fInputLength;
1821     resetPreserveRegion();
1822     return *this;
1823 }
1824 
1825 
1826 
resetPreserveRegion()1827 void RegexMatcher::resetPreserveRegion() {
1828     fMatchStart     = 0;
1829     fMatchEnd       = 0;
1830     fLastMatchEnd   = -1;
1831     fAppendPosition = 0;
1832     fMatch          = FALSE;
1833     fHitEnd         = FALSE;
1834     fRequireEnd     = FALSE;
1835     fTime           = 0;
1836     fTickCounter    = TIMER_INITIAL_VALUE;
1837     //resetStack(); // more expensive than it looks...
1838 }
1839 
1840 
reset(const UnicodeString & input)1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843     if (fPattern->fNeedsAltInput) {
1844         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845     }
1846     if (U_FAILURE(fDeferredStatus)) {
1847         return *this;
1848     }
1849     fInputLength = utext_nativeLength(fInputText);
1850 
1851     reset();
1852     delete fInput;
1853     fInput = NULL;
1854 
1855     //  Do the following for any UnicodeString.
1856     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1857     fInputUniStrMaybeMutable = TRUE;
1858 
1859 #if UCONFIG_NO_BREAK_ITERATION==0
1860     if (fWordBreakItr) {
1861         fWordBreakItr->setText(fInputText, fDeferredStatus);
1862     }
1863     if (fGCBreakItr) {
1864         fGCBreakItr->setText(fInputText, fDeferredStatus);
1865     }
1866 #endif
1867 
1868     return *this;
1869 }
1870 
1871 
reset(UText * input)1872 RegexMatcher &RegexMatcher::reset(UText *input) {
1873     if (fInputText != input) {
1874         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1875         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1876         if (U_FAILURE(fDeferredStatus)) {
1877             return *this;
1878         }
1879         fInputLength = utext_nativeLength(fInputText);
1880 
1881         delete fInput;
1882         fInput = NULL;
1883 
1884 #if UCONFIG_NO_BREAK_ITERATION==0
1885         if (fWordBreakItr) {
1886             fWordBreakItr->setText(input, fDeferredStatus);
1887         }
1888         if (fGCBreakItr) {
1889             fGCBreakItr->setText(fInputText, fDeferredStatus);
1890         }
1891 #endif
1892     }
1893     reset();
1894     fInputUniStrMaybeMutable = FALSE;
1895 
1896     return *this;
1897 }
1898 
1899 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1900     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1901     return *this;
1902 }*/
1903 
reset(int64_t position,UErrorCode & status)1904 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1905     if (U_FAILURE(status)) {
1906         return *this;
1907     }
1908     reset();       // Reset also resets the region to be the entire string.
1909 
1910     if (position < 0 || position > fActiveLimit) {
1911         status = U_INDEX_OUTOFBOUNDS_ERROR;
1912         return *this;
1913     }
1914     fMatchEnd = position;
1915     return *this;
1916 }
1917 
1918 
1919 //--------------------------------------------------------------------------------
1920 //
1921 //    refresh
1922 //
1923 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1924 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1925     if (U_FAILURE(status)) {
1926         return *this;
1927     }
1928     if (input == NULL) {
1929         status = U_ILLEGAL_ARGUMENT_ERROR;
1930         return *this;
1931     }
1932     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1933         status = U_ILLEGAL_ARGUMENT_ERROR;
1934         return *this;
1935     }
1936     int64_t  pos = utext_getNativeIndex(fInputText);
1937     //  Shallow read-only clone of the new UText into the existing input UText
1938     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1939     if (U_FAILURE(status)) {
1940         return *this;
1941     }
1942     utext_setNativeIndex(fInputText, pos);
1943 
1944     if (fAltInputText != NULL) {
1945         pos = utext_getNativeIndex(fAltInputText);
1946         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1947         if (U_FAILURE(status)) {
1948             return *this;
1949         }
1950         utext_setNativeIndex(fAltInputText, pos);
1951     }
1952     return *this;
1953 }
1954 
1955 
1956 
1957 //--------------------------------------------------------------------------------
1958 //
1959 //    setTrace
1960 //
1961 //--------------------------------------------------------------------------------
setTrace(UBool state)1962 void RegexMatcher::setTrace(UBool state) {
1963     fTraceDebug = state;
1964 }
1965 
1966 
1967 
1968 /**
1969   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1970   *
1971   *     @param src    The source UText
1972   *     @param dest   The destination UText. Must be writable.
1973   *                   May be NULL, in which case a new UText will be allocated.
1974   *     @param start  Start index of source substring.
1975   *     @param limit  Limit index of source substring.
1976   *     @param status An error code.
1977   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1978 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1979     if (U_FAILURE(*status)) {
1980         return dest;
1981     }
1982     if (start == limit) {
1983         if (dest) {
1984             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1985             return dest;
1986         } else {
1987             return utext_openUChars(NULL, NULL, 0, status);
1988         }
1989     }
1990     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1991     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1992         return dest;
1993     }
1994     *status = U_ZERO_ERROR;
1995     MaybeStackArray<UChar, 40> buffer;
1996     if (length >= buffer.getCapacity()) {
1997         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
1998         if (newBuf == NULL) {
1999             *status = U_MEMORY_ALLOCATION_ERROR;
2000         }
2001     }
2002     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2003     if (dest) {
2004         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2005         return dest;
2006     }
2007 
2008     // Caller did not provide a prexisting UText.
2009     // Open a new one, and have it adopt the text buffer storage.
2010     if (U_FAILURE(*status)) {
2011         return NULL;
2012     }
2013     int32_t ownedLength = 0;
2014     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2015     if (ownedBuf == NULL) {
2016         *status = U_MEMORY_ALLOCATION_ERROR;
2017         return NULL;
2018     }
2019     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2020     if (U_FAILURE(*status)) {
2021         uprv_free(ownedBuf);
2022         return NULL;
2023     }
2024     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2025     return result;
2026 }
2027 
2028 
2029 //---------------------------------------------------------------------
2030 //
2031 //   split
2032 //
2033 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2034 int32_t  RegexMatcher::split(const UnicodeString &input,
2035         UnicodeString    dest[],
2036         int32_t          destCapacity,
2037         UErrorCode      &status)
2038 {
2039     UText inputText = UTEXT_INITIALIZER;
2040     utext_openConstUnicodeString(&inputText, &input, &status);
2041     if (U_FAILURE(status)) {
2042         return 0;
2043     }
2044 
2045     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2046     if (destText == NULL) {
2047         status = U_MEMORY_ALLOCATION_ERROR;
2048         return 0;
2049     }
2050     int32_t i;
2051     for (i = 0; i < destCapacity; i++) {
2052         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2053     }
2054 
2055     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2056 
2057     for (i = 0; i < destCapacity; i++) {
2058         utext_close(destText[i]);
2059     }
2060 
2061     uprv_free(destText);
2062     utext_close(&inputText);
2063     return fieldCount;
2064 }
2065 
2066 //
2067 //   split, UText mode
2068 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2069 int32_t  RegexMatcher::split(UText *input,
2070         UText           *dest[],
2071         int32_t          destCapacity,
2072         UErrorCode      &status)
2073 {
2074     //
2075     // Check arguments for validity
2076     //
2077     if (U_FAILURE(status)) {
2078         return 0;
2079     }
2080 
2081     if (destCapacity < 1) {
2082         status = U_ILLEGAL_ARGUMENT_ERROR;
2083         return 0;
2084     }
2085 
2086     //
2087     // Reset for the input text
2088     //
2089     reset(input);
2090     int64_t   nextOutputStringStart = 0;
2091     if (fActiveLimit == 0) {
2092         return 0;
2093     }
2094 
2095     //
2096     // Loop through the input text, searching for the delimiter pattern
2097     //
2098     int32_t i;
2099     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2100     for (i=0; ; i++) {
2101         if (i>=destCapacity-1) {
2102             // There is one or zero output string left.
2103             // Fill the last output string with whatever is left from the input, then exit the loop.
2104             //  ( i will be == destCapacity if we filled the output array while processing
2105             //    capture groups of the delimiter expression, in which case we will discard the
2106             //    last capture group saved in favor of the unprocessed remainder of the
2107             //    input string.)
2108             i = destCapacity-1;
2109             if (fActiveLimit > nextOutputStringStart) {
2110                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2111                     if (dest[i]) {
2112                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2113                                       input->chunkContents+nextOutputStringStart,
2114                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2115                     } else {
2116                         UText remainingText = UTEXT_INITIALIZER;
2117                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2118                                          fActiveLimit-nextOutputStringStart, &status);
2119                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2120                         utext_close(&remainingText);
2121                     }
2122                 } else {
2123                     UErrorCode lengthStatus = U_ZERO_ERROR;
2124                     int32_t remaining16Length =
2125                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2126                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2127                     if (remainingChars == NULL) {
2128                         status = U_MEMORY_ALLOCATION_ERROR;
2129                         break;
2130                     }
2131 
2132                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2133                     if (dest[i]) {
2134                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2135                     } else {
2136                         UText remainingText = UTEXT_INITIALIZER;
2137                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2138                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2139                         utext_close(&remainingText);
2140                     }
2141 
2142                     uprv_free(remainingChars);
2143                 }
2144             }
2145             break;
2146         }
2147         if (find()) {
2148             // We found another delimiter.  Move everything from where we started looking
2149             //  up until the start of the delimiter into the next output string.
2150             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2151                 if (dest[i]) {
2152                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2153                                   input->chunkContents+nextOutputStringStart,
2154                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2155                 } else {
2156                     UText remainingText = UTEXT_INITIALIZER;
2157                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2158                                       fMatchStart-nextOutputStringStart, &status);
2159                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2160                     utext_close(&remainingText);
2161                 }
2162             } else {
2163                 UErrorCode lengthStatus = U_ZERO_ERROR;
2164                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2165                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2166                 if (remainingChars == NULL) {
2167                     status = U_MEMORY_ALLOCATION_ERROR;
2168                     break;
2169                 }
2170                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2171                 if (dest[i]) {
2172                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2173                 } else {
2174                     UText remainingText = UTEXT_INITIALIZER;
2175                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2176                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2177                     utext_close(&remainingText);
2178                 }
2179 
2180                 uprv_free(remainingChars);
2181             }
2182             nextOutputStringStart = fMatchEnd;
2183 
2184             // If the delimiter pattern has capturing parentheses, the captured
2185             //  text goes out into the next n destination strings.
2186             int32_t groupNum;
2187             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2188                 if (i >= destCapacity-2) {
2189                     // Never fill the last available output string with capture group text.
2190                     // It will filled with the last field, the remainder of the
2191                     //  unsplit input text.
2192                     break;
2193                 }
2194                 i++;
2195                 dest[i] = utext_extract_replace(fInputText, dest[i],
2196                                                start64(groupNum, status), end64(groupNum, status), &status);
2197             }
2198 
2199             if (nextOutputStringStart == fActiveLimit) {
2200                 // The delimiter was at the end of the string.  We're done, but first
2201                 // we output one last empty string, for the empty field following
2202                 //   the delimiter at the end of input.
2203                 if (i+1 < destCapacity) {
2204                     ++i;
2205                     if (dest[i] == NULL) {
2206                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2207                     } else {
2208                         static const UChar emptyString[] = {(UChar)0};
2209                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2210                     }
2211                 }
2212                 break;
2213 
2214             }
2215         }
2216         else
2217         {
2218             // We ran off the end of the input while looking for the next delimiter.
2219             // All the remaining text goes into the current output string.
2220             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2221                 if (dest[i]) {
2222                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2223                                   input->chunkContents+nextOutputStringStart,
2224                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2225                 } else {
2226                     UText remainingText = UTEXT_INITIALIZER;
2227                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2228                                      fActiveLimit-nextOutputStringStart, &status);
2229                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2230                     utext_close(&remainingText);
2231                 }
2232             } else {
2233                 UErrorCode lengthStatus = U_ZERO_ERROR;
2234                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2235                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2236                 if (remainingChars == NULL) {
2237                     status = U_MEMORY_ALLOCATION_ERROR;
2238                     break;
2239                 }
2240 
2241                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2242                 if (dest[i]) {
2243                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2244                 } else {
2245                     UText remainingText = UTEXT_INITIALIZER;
2246                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2247                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2248                     utext_close(&remainingText);
2249                 }
2250 
2251                 uprv_free(remainingChars);
2252             }
2253             break;
2254         }
2255         if (U_FAILURE(status)) {
2256             break;
2257         }
2258     }   // end of for loop
2259     return i+1;
2260 }
2261 
2262 
2263 //--------------------------------------------------------------------------------
2264 //
2265 //     start
2266 //
2267 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2268 int32_t RegexMatcher::start(UErrorCode &status) const {
2269     return start(0, status);
2270 }
2271 
start64(UErrorCode & status) const2272 int64_t RegexMatcher::start64(UErrorCode &status) const {
2273     return start64(0, status);
2274 }
2275 
2276 //--------------------------------------------------------------------------------
2277 //
2278 //     start(int32_t group, UErrorCode &status)
2279 //
2280 //--------------------------------------------------------------------------------
2281 
start64(int32_t group,UErrorCode & status) const2282 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2283     if (U_FAILURE(status)) {
2284         return -1;
2285     }
2286     if (U_FAILURE(fDeferredStatus)) {
2287         status = fDeferredStatus;
2288         return -1;
2289     }
2290     if (fMatch == FALSE) {
2291         status = U_REGEX_INVALID_STATE;
2292         return -1;
2293     }
2294     if (group < 0 || group > fPattern->fGroupMap->size()) {
2295         status = U_INDEX_OUTOFBOUNDS_ERROR;
2296         return -1;
2297     }
2298     int64_t s;
2299     if (group == 0) {
2300         s = fMatchStart;
2301     } else {
2302         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2303         U_ASSERT(groupOffset < fPattern->fFrameSize);
2304         U_ASSERT(groupOffset >= 0);
2305         s = fFrame->fExtra[groupOffset];
2306     }
2307 
2308     return s;
2309 }
2310 
2311 
start(int32_t group,UErrorCode & status) const2312 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2313     return (int32_t)start64(group, status);
2314 }
2315 
2316 //--------------------------------------------------------------------------------
2317 //
2318 //     useAnchoringBounds
2319 //
2320 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2321 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2322     fAnchoringBounds = b;
2323     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2324     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2325     return *this;
2326 }
2327 
2328 
2329 //--------------------------------------------------------------------------------
2330 //
2331 //     useTransparentBounds
2332 //
2333 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2334 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2335     fTransparentBounds = b;
2336     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2337     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2338     return *this;
2339 }
2340 
2341 //--------------------------------------------------------------------------------
2342 //
2343 //     setTimeLimit
2344 //
2345 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2346 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2347     if (U_FAILURE(status)) {
2348         return;
2349     }
2350     if (U_FAILURE(fDeferredStatus)) {
2351         status = fDeferredStatus;
2352         return;
2353     }
2354     if (limit < 0) {
2355         status = U_ILLEGAL_ARGUMENT_ERROR;
2356         return;
2357     }
2358     fTimeLimit = limit;
2359 }
2360 
2361 
2362 //--------------------------------------------------------------------------------
2363 //
2364 //     getTimeLimit
2365 //
2366 //--------------------------------------------------------------------------------
getTimeLimit() const2367 int32_t RegexMatcher::getTimeLimit() const {
2368     return fTimeLimit;
2369 }
2370 
2371 
2372 //--------------------------------------------------------------------------------
2373 //
2374 //     setStackLimit
2375 //
2376 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2377 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2378     if (U_FAILURE(status)) {
2379         return;
2380     }
2381     if (U_FAILURE(fDeferredStatus)) {
2382         status = fDeferredStatus;
2383         return;
2384     }
2385     if (limit < 0) {
2386         status = U_ILLEGAL_ARGUMENT_ERROR;
2387         return;
2388     }
2389 
2390     // Reset the matcher.  This is needed here in case there is a current match
2391     //    whose final stack frame (containing the match results, pointed to by fFrame)
2392     //    would be lost by resizing to a smaller stack size.
2393     reset();
2394 
2395     if (limit == 0) {
2396         // Unlimited stack expansion
2397         fStack->setMaxCapacity(0);
2398     } else {
2399         // Change the units of the limit  from bytes to ints, and bump the size up
2400         //   to be big enough to hold at least one stack frame for the pattern,
2401         //   if it isn't there already.
2402         int32_t adjustedLimit = limit / sizeof(int32_t);
2403         if (adjustedLimit < fPattern->fFrameSize) {
2404             adjustedLimit = fPattern->fFrameSize;
2405         }
2406         fStack->setMaxCapacity(adjustedLimit);
2407     }
2408     fStackLimit = limit;
2409 }
2410 
2411 
2412 //--------------------------------------------------------------------------------
2413 //
2414 //     getStackLimit
2415 //
2416 //--------------------------------------------------------------------------------
getStackLimit() const2417 int32_t RegexMatcher::getStackLimit() const {
2418     return fStackLimit;
2419 }
2420 
2421 
2422 //--------------------------------------------------------------------------------
2423 //
2424 //     setMatchCallback
2425 //
2426 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2427 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2428                                     const void              *context,
2429                                     UErrorCode              &status) {
2430     if (U_FAILURE(status)) {
2431         return;
2432     }
2433     fCallbackFn = callback;
2434     fCallbackContext = context;
2435 }
2436 
2437 
2438 //--------------------------------------------------------------------------------
2439 //
2440 //     getMatchCallback
2441 //
2442 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2443 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2444                                   const void              *&context,
2445                                   UErrorCode              &status) {
2446     if (U_FAILURE(status)) {
2447        return;
2448     }
2449     callback = fCallbackFn;
2450     context  = fCallbackContext;
2451 }
2452 
2453 
2454 //--------------------------------------------------------------------------------
2455 //
2456 //     setMatchCallback
2457 //
2458 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2459 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2460                                                 const void                      *context,
2461                                                 UErrorCode                      &status) {
2462     if (U_FAILURE(status)) {
2463         return;
2464     }
2465     fFindProgressCallbackFn = callback;
2466     fFindProgressCallbackContext = context;
2467 }
2468 
2469 
2470 //--------------------------------------------------------------------------------
2471 //
2472 //     getMatchCallback
2473 //
2474 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2475 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2476                                                 const void                    *&context,
2477                                                 UErrorCode                    &status) {
2478     if (U_FAILURE(status)) {
2479        return;
2480     }
2481     callback = fFindProgressCallbackFn;
2482     context  = fFindProgressCallbackContext;
2483 }
2484 
2485 
2486 //================================================================================
2487 //
2488 //    Code following this point in this file is the internal
2489 //    Match Engine Implementation.
2490 //
2491 //================================================================================
2492 
2493 
2494 //--------------------------------------------------------------------------------
2495 //
2496 //   resetStack
2497 //           Discard any previous contents of the state save stack, and initialize a
2498 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2499 //           where they indicate that a group has not yet matched anything.
2500 //--------------------------------------------------------------------------------
resetStack()2501 REStackFrame *RegexMatcher::resetStack() {
2502     // Discard any previous contents of the state save stack, and initialize a
2503     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2504     //  where they indicate that a group has not yet matched anything.
2505     fStack->removeAllElements();
2506 
2507     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2508     if(U_FAILURE(fDeferredStatus)) {
2509         return NULL;
2510     }
2511 
2512     int32_t i;
2513     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2514         iFrame->fExtra[i] = -1;
2515     }
2516     return iFrame;
2517 }
2518 
2519 
2520 
2521 //--------------------------------------------------------------------------------
2522 //
2523 //   isWordBoundary
2524 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2525 //                     For us,
2526 //                       If the current char is a combining mark,
2527 //                          \b is FALSE.
2528 //                       Else Scan backwards to the first non-combining char.
2529 //                            We are at a boundary if the this char and the original chars are
2530 //                               opposite in membership in \w set
2531 //
2532 //          parameters:   pos   - the current position in the input buffer
2533 //
2534 //              TODO:  double-check edge cases at region boundaries.
2535 //
2536 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2537 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2538     UBool isBoundary = FALSE;
2539     UBool cIsWord    = FALSE;
2540 
2541     if (pos >= fLookLimit) {
2542         fHitEnd = TRUE;
2543     } else {
2544         // Determine whether char c at current position is a member of the word set of chars.
2545         // If we're off the end of the string, behave as though we're not at a word char.
2546         UTEXT_SETNATIVEINDEX(fInputText, pos);
2547         UChar32  c = UTEXT_CURRENT32(fInputText);
2548         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2549             // Current char is a combining one.  Not a boundary.
2550             return FALSE;
2551         }
2552         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2553     }
2554 
2555     // Back up until we come to a non-combining char, determine whether
2556     //  that char is a word char.
2557     UBool prevCIsWord = FALSE;
2558     for (;;) {
2559         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2560             break;
2561         }
2562         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2563         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2564               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2565             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2566             break;
2567         }
2568     }
2569     isBoundary = cIsWord ^ prevCIsWord;
2570     return isBoundary;
2571 }
2572 
isChunkWordBoundary(int32_t pos)2573 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2574     UBool isBoundary = FALSE;
2575     UBool cIsWord    = FALSE;
2576 
2577     const UChar *inputBuf = fInputText->chunkContents;
2578 
2579     if (pos >= fLookLimit) {
2580         fHitEnd = TRUE;
2581     } else {
2582         // Determine whether char c at current position is a member of the word set of chars.
2583         // If we're off the end of the string, behave as though we're not at a word char.
2584         UChar32 c;
2585         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2586         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2587             // Current char is a combining one.  Not a boundary.
2588             return FALSE;
2589         }
2590         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2591     }
2592 
2593     // Back up until we come to a non-combining char, determine whether
2594     //  that char is a word char.
2595     UBool prevCIsWord = FALSE;
2596     for (;;) {
2597         if (pos <= fLookStart) {
2598             break;
2599         }
2600         UChar32 prevChar;
2601         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2602         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2603               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2604             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2605             break;
2606         }
2607     }
2608     isBoundary = cIsWord ^ prevCIsWord;
2609     return isBoundary;
2610 }
2611 
2612 //--------------------------------------------------------------------------------
2613 //
2614 //   isUWordBoundary
2615 //
2616 //         Test for a word boundary using RBBI word break.
2617 //
2618 //          parameters:   pos   - the current position in the input buffer
2619 //
2620 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos,UErrorCode & status)2621 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2622     UBool       returnVal = FALSE;
2623 
2624 #if UCONFIG_NO_BREAK_ITERATION==0
2625     // Note: this point will never be reached if break iteration is configured out.
2626     //       Regex patterns that would require this function will fail to compile.
2627 
2628     // If we haven't yet created a break iterator for this matcher, do it now.
2629     if (fWordBreakItr == nullptr) {
2630         fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2631         if (U_FAILURE(status)) {
2632             return FALSE;
2633         }
2634         fWordBreakItr->setText(fInputText, status);
2635     }
2636 
2637     // Note: zero width boundary tests like \b see through transparent region bounds,
2638     //       which is why fLookLimit is used here, rather than fActiveLimit.
2639     if (pos >= fLookLimit) {
2640         fHitEnd = TRUE;
2641         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2642                             //    words are not boundaries.  All non-word chars stand by themselves,
2643                             //    with word boundaries on both sides.
2644     } else {
2645         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2646     }
2647 #endif
2648     return   returnVal;
2649 }
2650 
2651 
followingGCBoundary(int64_t pos,UErrorCode & status)2652 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2653     int64_t result = pos;
2654 
2655 #if UCONFIG_NO_BREAK_ITERATION==0
2656     // Note: this point will never be reached if break iteration is configured out.
2657     //       Regex patterns that would require this function will fail to compile.
2658 
2659     // If we haven't yet created a break iterator for this matcher, do it now.
2660     if (fGCBreakItr == nullptr) {
2661         fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2662         if (U_FAILURE(status)) {
2663             return pos;
2664         }
2665         fGCBreakItr->setText(fInputText, status);
2666     }
2667     result = fGCBreakItr->following(pos);
2668     if (result == BreakIterator::DONE) {
2669         result = pos;
2670     }
2671 #endif
2672     return result;
2673 }
2674 
2675 //--------------------------------------------------------------------------------
2676 //
2677 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2678 //                     saves. Increment the "time" counter, and call the
2679 //                     user callback function if there is one installed.
2680 //
2681 //                     If the match operation needs to be aborted, either for a time-out
2682 //                     or because the user callback asked for it, just set an error status.
2683 //                     The engine will pick that up and stop in its outer loop.
2684 //
2685 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2686 void RegexMatcher::IncrementTime(UErrorCode &status) {
2687     fTickCounter = TIMER_INITIAL_VALUE;
2688     fTime++;
2689     if (fCallbackFn != NULL) {
2690         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2691             status = U_REGEX_STOPPED_BY_CALLER;
2692             return;
2693         }
2694     }
2695     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2696         status = U_REGEX_TIME_OUT;
2697     }
2698 }
2699 
2700 //--------------------------------------------------------------------------------
2701 //
2702 //   StateSave
2703 //       Make a new stack frame, initialized as a copy of the current stack frame.
2704 //       Set the pattern index in the original stack frame from the operand value
2705 //       in the opcode.  Execution of the engine continues with the state in
2706 //       the newly created stack frame
2707 //
2708 //       Note that reserveBlock() may grow the stack, resulting in the
2709 //       whole thing being relocated in memory.
2710 //
2711 //    Parameters:
2712 //       fp           The top frame pointer when called.  At return, a new
2713 //                    fame will be present
2714 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2715 //                    (not new) frame.  If execution ever back-tracks out of the
2716 //                    new frame, this will be where we continue from in the pattern.
2717 //    Return
2718 //                    The new frame pointer.
2719 //
2720 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2721 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2722     if (U_FAILURE(status)) {
2723         return fp;
2724     }
2725     // push storage for a new frame.
2726     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2727     if (U_FAILURE(status)) {
2728         // Failure on attempted stack expansion.
2729         //   Stack function set some other error code, change it to a more
2730         //   specific one for regular expressions.
2731         status = U_REGEX_STACK_OVERFLOW;
2732         // We need to return a writable stack frame, so just return the
2733         //    previous frame.  The match operation will stop quickly
2734         //    because of the error status, after which the frame will never
2735         //    be looked at again.
2736         return fp;
2737     }
2738     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2739 
2740     // New stack frame = copy of old top frame.
2741     int64_t *source = (int64_t *)fp;
2742     int64_t *dest   = newFP;
2743     for (;;) {
2744         *dest++ = *source++;
2745         if (source == newFP) {
2746             break;
2747         }
2748     }
2749 
2750     fTickCounter--;
2751     if (fTickCounter <= 0) {
2752        IncrementTime(status);    // Re-initializes fTickCounter
2753     }
2754     fp->fPatIdx = savePatIdx;
2755     return (REStackFrame *)newFP;
2756 }
2757 
2758 #if defined(REGEX_DEBUG)
2759 namespace {
StringFromUText(UText * ut)2760 UnicodeString StringFromUText(UText *ut) {
2761     UnicodeString result;
2762     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2763         result.append(c);
2764     }
2765     return result;
2766 }
2767 }
2768 #endif // REGEX_DEBUG
2769 
2770 
2771 //--------------------------------------------------------------------------------
2772 //
2773 //   MatchAt      This is the actual matching engine.
2774 //
2775 //                  startIdx:    begin matching a this index.
2776 //                  toEnd:       if true, match must extend to end of the input region
2777 //
2778 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2779 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2780     UBool       isMatch  = FALSE;      // True if the we have a match.
2781 
2782     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2783 
2784     int32_t     op;                    // Operation from the compiled pattern, split into
2785     int32_t     opType;                //    the opcode
2786     int32_t     opValue;               //    and the operand value.
2787 
2788 #ifdef REGEX_RUN_DEBUG
2789     if (fTraceDebug) {
2790         printf("MatchAt(startIdx=%ld)\n", startIdx);
2791         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2792         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2793     }
2794 #endif
2795 
2796     if (U_FAILURE(status)) {
2797         return;
2798     }
2799 
2800     //  Cache frequently referenced items from the compiled pattern
2801     //
2802     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2803 
2804     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2805     UVector             *fSets         = fPattern->fSets;
2806 
2807     fFrameSize = fPattern->fFrameSize;
2808     REStackFrame        *fp            = resetStack();
2809     if (U_FAILURE(fDeferredStatus)) {
2810         status = fDeferredStatus;
2811         return;
2812     }
2813 
2814     fp->fPatIdx   = 0;
2815     fp->fInputIdx = startIdx;
2816 
2817     // Zero out the pattern's static data
2818     int32_t i;
2819     for (i = 0; i<fPattern->fDataSize; i++) {
2820         fData[i] = 0;
2821     }
2822 
2823     //
2824     //  Main loop for interpreting the compiled pattern.
2825     //  One iteration of the loop per pattern operation performed.
2826     //
2827     for (;;) {
2828         op      = (int32_t)pat[fp->fPatIdx];
2829         opType  = URX_TYPE(op);
2830         opValue = URX_VAL(op);
2831 #ifdef REGEX_RUN_DEBUG
2832         if (fTraceDebug) {
2833             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2834             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2835                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2836             fPattern->dumpOp(fp->fPatIdx);
2837         }
2838 #endif
2839         fp->fPatIdx++;
2840 
2841         switch (opType) {
2842 
2843 
2844         case URX_NOP:
2845             break;
2846 
2847 
2848         case URX_BACKTRACK:
2849             // Force a backtrack.  In some circumstances, the pattern compiler
2850             //   will notice that the pattern can't possibly match anything, and will
2851             //   emit one of these at that point.
2852             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2853             break;
2854 
2855 
2856         case URX_ONECHAR:
2857             if (fp->fInputIdx < fActiveLimit) {
2858                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2859                 UChar32 c = UTEXT_NEXT32(fInputText);
2860                 if (c == opValue) {
2861                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2862                     break;
2863                 }
2864             } else {
2865                 fHitEnd = TRUE;
2866             }
2867             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2868             break;
2869 
2870 
2871         case URX_STRING:
2872             {
2873                 // Test input against a literal string.
2874                 // Strings require two slots in the compiled pattern, one for the
2875                 //   offset to the string text, and one for the length.
2876 
2877                 int32_t   stringStartIdx = opValue;
2878                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2879                 fp->fPatIdx++;
2880                 opType    = URX_TYPE(op);
2881                 int32_t stringLen = URX_VAL(op);
2882                 U_ASSERT(opType == URX_STRING_LEN);
2883                 U_ASSERT(stringLen >= 2);
2884 
2885                 const UChar *patternString = litText+stringStartIdx;
2886                 int32_t patternStringIndex = 0;
2887                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2888                 UChar32 inputChar;
2889                 UChar32 patternChar;
2890                 UBool success = TRUE;
2891                 while (patternStringIndex < stringLen) {
2892                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2893                         success = FALSE;
2894                         fHitEnd = TRUE;
2895                         break;
2896                     }
2897                     inputChar = UTEXT_NEXT32(fInputText);
2898                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2899                     if (patternChar != inputChar) {
2900                         success = FALSE;
2901                         break;
2902                     }
2903                 }
2904 
2905                 if (success) {
2906                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2907                 } else {
2908                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2909                 }
2910             }
2911             break;
2912 
2913 
2914         case URX_STATE_SAVE:
2915             fp = StateSave(fp, opValue, status);
2916             break;
2917 
2918 
2919         case URX_END:
2920             // The match loop will exit via this path on a successful match,
2921             //   when we reach the end of the pattern.
2922             if (toEnd && fp->fInputIdx != fActiveLimit) {
2923                 // The pattern matched, but not to the end of input.  Try some more.
2924                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2925                 break;
2926             }
2927             isMatch = TRUE;
2928             goto  breakFromLoop;
2929 
2930         // Start and End Capture stack frame variables are laid out out like this:
2931             //  fp->fExtra[opValue]  - The start of a completed capture group
2932             //             opValue+1 - The end   of a completed capture group
2933             //             opValue+2 - the start of a capture group whose end
2934             //                          has not yet been reached (and might not ever be).
2935         case URX_START_CAPTURE:
2936             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2937             fp->fExtra[opValue+2] = fp->fInputIdx;
2938             break;
2939 
2940 
2941         case URX_END_CAPTURE:
2942             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2943             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2944             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2945             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2946             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2947             break;
2948 
2949 
2950         case URX_DOLLAR:                   //  $, test for End of line
2951                                            //     or for position before new line at end of input
2952             {
2953                 if (fp->fInputIdx >= fAnchorLimit) {
2954                     // We really are at the end of input.  Success.
2955                     fHitEnd = TRUE;
2956                     fRequireEnd = TRUE;
2957                     break;
2958                 }
2959 
2960                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2961 
2962                 // If we are positioned just before a new-line that is located at the
2963                 //   end of input, succeed.
2964                 UChar32 c = UTEXT_NEXT32(fInputText);
2965                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2966                     if (isLineTerminator(c)) {
2967                         // If not in the middle of a CR/LF sequence
2968                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2969                             // At new-line at end of input. Success
2970                             fHitEnd = TRUE;
2971                             fRequireEnd = TRUE;
2972 
2973                             break;
2974                         }
2975                     }
2976                 } else {
2977                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2978                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2979                         fHitEnd = TRUE;
2980                         fRequireEnd = TRUE;
2981                         break;                         // At CR/LF at end of input.  Success
2982                     }
2983                 }
2984 
2985                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2986             }
2987             break;
2988 
2989 
2990          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2991             if (fp->fInputIdx >= fAnchorLimit) {
2992                 // Off the end of input.  Success.
2993                 fHitEnd = TRUE;
2994                 fRequireEnd = TRUE;
2995                 break;
2996             } else {
2997                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2998                 UChar32 c = UTEXT_NEXT32(fInputText);
2999                 // Either at the last character of input, or off the end.
3000                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3001                     fHitEnd = TRUE;
3002                     fRequireEnd = TRUE;
3003                     break;
3004                 }
3005             }
3006 
3007             // Not at end of input.  Back-track out.
3008             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3009             break;
3010 
3011 
3012          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3013              {
3014                  if (fp->fInputIdx >= fAnchorLimit) {
3015                      // We really are at the end of input.  Success.
3016                      fHitEnd = TRUE;
3017                      fRequireEnd = TRUE;
3018                      break;
3019                  }
3020                  // If we are positioned just before a new-line, succeed.
3021                  // It makes no difference where the new-line is within the input.
3022                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3023                  UChar32 c = UTEXT_CURRENT32(fInputText);
3024                  if (isLineTerminator(c)) {
3025                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3026                      //  In multi-line mode, hitting a new-line just before the end of input does not
3027                      //   set the hitEnd or requireEnd flags
3028                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3029                         break;
3030                      }
3031                  }
3032                  // not at a new line.  Fail.
3033                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3034              }
3035              break;
3036 
3037 
3038          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3039              {
3040                  if (fp->fInputIdx >= fAnchorLimit) {
3041                      // We really are at the end of input.  Success.
3042                      fHitEnd = TRUE;
3043                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3044                      break;               //   adding a new-line would not lose the match.
3045                  }
3046                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3047                  // It makes no difference where the new-line is within the input.
3048                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3049                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3050                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051                  }
3052              }
3053              break;
3054 
3055 
3056        case URX_CARET:                    //  ^, test for start of line
3057             if (fp->fInputIdx != fAnchorStart) {
3058                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3059             }
3060             break;
3061 
3062 
3063        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3064            {
3065                if (fp->fInputIdx == fAnchorStart) {
3066                    // We are at the start input.  Success.
3067                    break;
3068                }
3069                // Check whether character just before the current pos is a new-line
3070                //   unless we are at the end of input
3071                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3072                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3073                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3074                    //  It's a new-line.  ^ is true.  Success.
3075                    //  TODO:  what should be done with positions between a CR and LF?
3076                    break;
3077                }
3078                // Not at the start of a line.  Fail.
3079                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080            }
3081            break;
3082 
3083 
3084        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3085            {
3086                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3087                if (fp->fInputIdx <= fAnchorStart) {
3088                    // We are at the start input.  Success.
3089                    break;
3090                }
3091                // Check whether character just before the current pos is a new-line
3092                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3093                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3094                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3095                if (c != 0x0a) {
3096                    // Not at the start of a line.  Back-track out.
3097                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098                }
3099            }
3100            break;
3101 
3102         case URX_BACKSLASH_B:          // Test for word boundaries
3103             {
3104                 UBool success = isWordBoundary(fp->fInputIdx);
3105                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3106                 if (!success) {
3107                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108                 }
3109             }
3110             break;
3111 
3112 
3113         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3114             {
3115                 UBool success = isUWordBoundary(fp->fInputIdx, status);
3116                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3117                 if (!success) {
3118                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3119                 }
3120             }
3121             break;
3122 
3123 
3124         case URX_BACKSLASH_D:            // Test for decimal digit
3125             {
3126                 if (fp->fInputIdx >= fActiveLimit) {
3127                     fHitEnd = TRUE;
3128                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129                     break;
3130                 }
3131 
3132                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3133 
3134                 UChar32 c = UTEXT_NEXT32(fInputText);
3135                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3136                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3137                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3138                 if (success) {
3139                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140                 } else {
3141                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142                 }
3143             }
3144             break;
3145 
3146 
3147         case URX_BACKSLASH_G:          // Test for position at end of previous match
3148             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3149                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150             }
3151             break;
3152 
3153 
3154         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3155             {
3156                 if (fp->fInputIdx >= fActiveLimit) {
3157                     fHitEnd = TRUE;
3158                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3159                     break;
3160                 }
3161                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3162                 UChar32 c = UTEXT_NEXT32(fInputText);
3163                 int8_t ctype = u_charType(c);
3164                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3165                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3166                 if (success) {
3167                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3168                 } else {
3169                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3170                 }
3171             }
3172             break;
3173 
3174 
3175         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3176             {
3177                 if (fp->fInputIdx >= fActiveLimit) {
3178                     fHitEnd = TRUE;
3179                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180                     break;
3181                 }
3182                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3183                 UChar32 c = UTEXT_NEXT32(fInputText);
3184                 if (isLineTerminator(c)) {
3185                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3186                         utext_next32(fInputText);
3187                     }
3188                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3189                 } else {
3190                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3191                 }
3192             }
3193             break;
3194 
3195 
3196         case URX_BACKSLASH_V:            // \v, any single line ending character.
3197             {
3198                 if (fp->fInputIdx >= fActiveLimit) {
3199                     fHitEnd = TRUE;
3200                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201                     break;
3202                 }
3203                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3204                 UChar32 c = UTEXT_NEXT32(fInputText);
3205                 UBool success = isLineTerminator(c);
3206                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3207                 if (success) {
3208                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3209                 } else {
3210                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211                 }
3212             }
3213             break;
3214 
3215 
3216         case URX_BACKSLASH_X:
3217             //  Match a Grapheme, as defined by Unicode UAX 29.
3218 
3219             // Fail if at end of input
3220             if (fp->fInputIdx >= fActiveLimit) {
3221                 fHitEnd = TRUE;
3222                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3223                 break;
3224             }
3225 
3226             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3227             if (fp->fInputIdx >= fActiveLimit) {
3228                 fHitEnd = TRUE;
3229                 fp->fInputIdx = fActiveLimit;
3230             }
3231             break;
3232 
3233 
3234         case URX_BACKSLASH_Z:          // Test for end of Input
3235             if (fp->fInputIdx < fAnchorLimit) {
3236                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3237             } else {
3238                 fHitEnd = TRUE;
3239                 fRequireEnd = TRUE;
3240             }
3241             break;
3242 
3243 
3244 
3245         case URX_STATIC_SETREF:
3246             {
3247                 // Test input character against one of the predefined sets
3248                 //    (Word Characters, for example)
3249                 // The high bit of the op value is a flag for the match polarity.
3250                 //    0:   success if input char is in set.
3251                 //    1:   success if input char is not in set.
3252                 if (fp->fInputIdx >= fActiveLimit) {
3253                     fHitEnd = TRUE;
3254                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3255                     break;
3256                 }
3257 
3258                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3259                 opValue &= ~URX_NEG_SET;
3260                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3261 
3262                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3263                 UChar32 c = UTEXT_NEXT32(fInputText);
3264                 if (c < 256) {
3265                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3266                     if (s8.contains(c)) {
3267                         success = !success;
3268                     }
3269                 } else {
3270                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3271                     if (s.contains(c)) {
3272                         success = !success;
3273                     }
3274                 }
3275                 if (success) {
3276                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3277                 } else {
3278                     // the character wasn't in the set.
3279                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3280                 }
3281             }
3282             break;
3283 
3284 
3285         case URX_STAT_SETREF_N:
3286             {
3287                 // Test input character for NOT being a member of  one of
3288                 //    the predefined sets (Word Characters, for example)
3289                 if (fp->fInputIdx >= fActiveLimit) {
3290                     fHitEnd = TRUE;
3291                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3292                     break;
3293                 }
3294 
3295                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3296 
3297                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3298 
3299                 UChar32 c = UTEXT_NEXT32(fInputText);
3300                 if (c < 256) {
3301                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3302                     if (s8.contains(c) == FALSE) {
3303                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3304                         break;
3305                     }
3306                 } else {
3307                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3308                     if (s.contains(c) == FALSE) {
3309                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3310                         break;
3311                     }
3312                 }
3313                 // the character wasn't in the set.
3314                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3315             }
3316             break;
3317 
3318 
3319         case URX_SETREF:
3320             if (fp->fInputIdx >= fActiveLimit) {
3321                 fHitEnd = TRUE;
3322                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3323                 break;
3324             } else {
3325                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3326 
3327                 // There is input left.  Pick up one char and test it for set membership.
3328                 UChar32 c = UTEXT_NEXT32(fInputText);
3329                 U_ASSERT(opValue > 0 && opValue < fSets->size());
3330                 if (c<256) {
3331                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3332                     if (s8->contains(c)) {
3333                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3334                         break;
3335                     }
3336                 } else {
3337                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3338                     if (s->contains(c)) {
3339                         // The character is in the set.  A Match.
3340                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3341                         break;
3342                     }
3343                 }
3344 
3345                 // the character wasn't in the set.
3346                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3347             }
3348             break;
3349 
3350 
3351         case URX_DOTANY:
3352             {
3353                 // . matches anything, but stops at end-of-line.
3354                 if (fp->fInputIdx >= fActiveLimit) {
3355                     // At end of input.  Match failed.  Backtrack out.
3356                     fHitEnd = TRUE;
3357                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3358                     break;
3359                 }
3360 
3361                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3362 
3363                 // There is input left.  Advance over one char, unless we've hit end-of-line
3364                 UChar32 c = UTEXT_NEXT32(fInputText);
3365                 if (isLineTerminator(c)) {
3366                     // End of line in normal mode.   . does not match.
3367                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368                     break;
3369                 }
3370                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3371             }
3372             break;
3373 
3374 
3375         case URX_DOTANY_ALL:
3376             {
3377                 // ., in dot-matches-all (including new lines) mode
3378                 if (fp->fInputIdx >= fActiveLimit) {
3379                     // At end of input.  Match failed.  Backtrack out.
3380                     fHitEnd = TRUE;
3381                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3382                     break;
3383                 }
3384 
3385                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3386 
3387                 // There is input left.  Advance over one char, except if we are
3388                 //   at a cr/lf, advance over both of them.
3389                 UChar32 c;
3390                 c = UTEXT_NEXT32(fInputText);
3391                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3392                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3393                     // In the case of a CR/LF, we need to advance over both.
3394                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3395                     if (nextc == 0x0a) {
3396                         (void)UTEXT_NEXT32(fInputText);
3397                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3398                     }
3399                 }
3400             }
3401             break;
3402 
3403 
3404         case URX_DOTANY_UNIX:
3405             {
3406                 // '.' operator, matches all, but stops at end-of-line.
3407                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3408                 if (fp->fInputIdx >= fActiveLimit) {
3409                     // At end of input.  Match failed.  Backtrack out.
3410                     fHitEnd = TRUE;
3411                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3412                     break;
3413                 }
3414 
3415                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3416 
3417                 // There is input left.  Advance over one char, unless we've hit end-of-line
3418                 UChar32 c = UTEXT_NEXT32(fInputText);
3419                 if (c == 0x0a) {
3420                     // End of line in normal mode.   '.' does not match the \n
3421                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422                 } else {
3423                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3424                 }
3425             }
3426             break;
3427 
3428 
3429         case URX_JMP:
3430             fp->fPatIdx = opValue;
3431             break;
3432 
3433         case URX_FAIL:
3434             isMatch = FALSE;
3435             goto breakFromLoop;
3436 
3437         case URX_JMP_SAV:
3438             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3439             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3440             fp->fPatIdx = opValue;                         // Then JMP.
3441             break;
3442 
3443         case URX_JMP_SAV_X:
3444             // This opcode is used with (x)+, when x can match a zero length string.
3445             // Same as JMP_SAV, except conditional on the match having made forward progress.
3446             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3447             //   data address of the input position at the start of the loop.
3448             {
3449                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3450                 int32_t  stoOp = (int32_t)pat[opValue-1];
3451                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3452                 int32_t  frameLoc = URX_VAL(stoOp);
3453                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3454                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3455                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3456                 if (prevInputIdx < fp->fInputIdx) {
3457                     // The match did make progress.  Repeat the loop.
3458                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3459                     fp->fPatIdx = opValue;
3460                     fp->fExtra[frameLoc] = fp->fInputIdx;
3461                 }
3462                 // If the input position did not advance, we do nothing here,
3463                 //   execution will fall out of the loop.
3464             }
3465             break;
3466 
3467         case URX_CTR_INIT:
3468             {
3469                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3470                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3471 
3472                 // Pick up the three extra operands that CTR_INIT has, and
3473                 //    skip the pattern location counter past
3474                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3475                 fp->fPatIdx += 3;
3476                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3477                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3478                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3479                 U_ASSERT(minCount>=0);
3480                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3481                 U_ASSERT(loopLoc>=fp->fPatIdx);
3482 
3483                 if (minCount == 0) {
3484                     fp = StateSave(fp, loopLoc+1, status);
3485                 }
3486                 if (maxCount == -1) {
3487                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3488                 } else if (maxCount == 0) {
3489                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3490                 }
3491             }
3492             break;
3493 
3494         case URX_CTR_LOOP:
3495             {
3496                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3497                 int32_t initOp = (int32_t)pat[opValue];
3498                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3499                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3500                 int32_t minCount  = (int32_t)pat[opValue+2];
3501                 int32_t maxCount  = (int32_t)pat[opValue+3];
3502                 (*pCounter)++;
3503                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3504                     U_ASSERT(*pCounter == maxCount);
3505                     break;
3506                 }
3507                 if (*pCounter >= minCount) {
3508                     if (maxCount == -1) {
3509                         // Loop has no hard upper bound.
3510                         // Check that it is progressing through the input, break if it is not.
3511                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3512                         if (fp->fInputIdx == *pLastInputIdx) {
3513                             break;
3514                         } else {
3515                             *pLastInputIdx = fp->fInputIdx;
3516                         }
3517                     }
3518                     fp = StateSave(fp, fp->fPatIdx, status);
3519                 } else {
3520                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3521                     fTickCounter--;
3522                     if (fTickCounter <= 0) {
3523                         IncrementTime(status);    // Re-initializes fTickCounter
3524                     }
3525                 }
3526 
3527                 fp->fPatIdx = opValue + 4;    // Loop back.
3528             }
3529             break;
3530 
3531         case URX_CTR_INIT_NG:
3532             {
3533                 // Initialize a non-greedy loop
3534                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3535                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3536 
3537                 // Pick up the three extra operands that CTR_INIT_NG has, and
3538                 //    skip the pattern location counter past
3539                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3540                 fp->fPatIdx += 3;
3541                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3542                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3543                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3544                 U_ASSERT(minCount>=0);
3545                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3546                 U_ASSERT(loopLoc>fp->fPatIdx);
3547                 if (maxCount == -1) {
3548                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3549                 }
3550 
3551                 if (minCount == 0) {
3552                     if (maxCount != 0) {
3553                         fp = StateSave(fp, fp->fPatIdx, status);
3554                     }
3555                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3556                 }
3557             }
3558             break;
3559 
3560         case URX_CTR_LOOP_NG:
3561             {
3562                 // Non-greedy {min, max} loops
3563                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3564                 int32_t initOp = (int32_t)pat[opValue];
3565                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3566                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3567                 int32_t minCount  = (int32_t)pat[opValue+2];
3568                 int32_t maxCount  = (int32_t)pat[opValue+3];
3569 
3570                 (*pCounter)++;
3571                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3572                     // The loop has matched the maximum permitted number of times.
3573                     //   Break out of here with no action.  Matching will
3574                     //   continue with the following pattern.
3575                     U_ASSERT(*pCounter == maxCount);
3576                     break;
3577                 }
3578 
3579                 if (*pCounter < minCount) {
3580                     // We haven't met the minimum number of matches yet.
3581                     //   Loop back for another one.
3582                     fp->fPatIdx = opValue + 4;    // Loop back.
3583                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3584                     fTickCounter--;
3585                     if (fTickCounter <= 0) {
3586                         IncrementTime(status);    // Re-initializes fTickCounter
3587                     }
3588                 } else {
3589                     // We do have the minimum number of matches.
3590 
3591                     // If there is no upper bound on the loop iterations, check that the input index
3592                     // is progressing, and stop the loop if it is not.
3593                     if (maxCount == -1) {
3594                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3595                         if (fp->fInputIdx == *pLastInputIdx) {
3596                             break;
3597                         }
3598                         *pLastInputIdx = fp->fInputIdx;
3599                     }
3600 
3601                     // Loop Continuation: we will fall into the pattern following the loop
3602                     //   (non-greedy, don't execute loop body first), but first do
3603                     //   a state save to the top of the loop, so that a match failure
3604                     //   in the following pattern will try another iteration of the loop.
3605                     fp = StateSave(fp, opValue + 4, status);
3606                 }
3607             }
3608             break;
3609 
3610         case URX_STO_SP:
3611             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3612             fData[opValue] = fStack->size();
3613             break;
3614 
3615         case URX_LD_SP:
3616             {
3617                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3618                 int32_t newStackSize = (int32_t)fData[opValue];
3619                 U_ASSERT(newStackSize <= fStack->size());
3620                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3621                 if (newFP == (int64_t *)fp) {
3622                     break;
3623                 }
3624                 int32_t j;
3625                 for (j=0; j<fFrameSize; j++) {
3626                     newFP[j] = ((int64_t *)fp)[j];
3627                 }
3628                 fp = (REStackFrame *)newFP;
3629                 fStack->setSize(newStackSize);
3630             }
3631             break;
3632 
3633         case URX_BACKREF:
3634             {
3635                 U_ASSERT(opValue < fFrameSize);
3636                 int64_t groupStartIdx = fp->fExtra[opValue];
3637                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3638                 U_ASSERT(groupStartIdx <= groupEndIdx);
3639                 if (groupStartIdx < 0) {
3640                     // This capture group has not participated in the match thus far,
3641                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3642                     break;
3643                 }
3644                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3645                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3646 
3647                 //   Note: if the capture group match was of an empty string the backref
3648                 //         match succeeds.  Verified by testing:  Perl matches succeed
3649                 //         in this case, so we do too.
3650 
3651                 UBool success = TRUE;
3652                 for (;;) {
3653                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3654                         success = TRUE;
3655                         break;
3656                     }
3657                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3658                         success = FALSE;
3659                         fHitEnd = TRUE;
3660                         break;
3661                     }
3662                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3663                     UChar32 inputChar = utext_next32(fInputText);
3664                     if (inputChar != captureGroupChar) {
3665                         success = FALSE;
3666                         break;
3667                     }
3668                 }
3669 
3670                 if (success) {
3671                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3672                 } else {
3673                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3674                 }
3675             }
3676             break;
3677 
3678 
3679 
3680         case URX_BACKREF_I:
3681             {
3682                 U_ASSERT(opValue < fFrameSize);
3683                 int64_t groupStartIdx = fp->fExtra[opValue];
3684                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3685                 U_ASSERT(groupStartIdx <= groupEndIdx);
3686                 if (groupStartIdx < 0) {
3687                     // This capture group has not participated in the match thus far,
3688                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3689                     break;
3690                 }
3691                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3692                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3693                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3694                 CaseFoldingUTextIterator inputItr(*fInputText);
3695 
3696                 //   Note: if the capture group match was of an empty string the backref
3697                 //         match succeeds.  Verified by testing:  Perl matches succeed
3698                 //         in this case, so we do too.
3699 
3700                 UBool success = TRUE;
3701                 for (;;) {
3702                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3703                         success = TRUE;
3704                         break;
3705                     }
3706                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3707                         success = FALSE;
3708                         fHitEnd = TRUE;
3709                         break;
3710                     }
3711                     UChar32 captureGroupChar = captureGroupItr.next();
3712                     UChar32 inputChar = inputItr.next();
3713                     if (inputChar != captureGroupChar) {
3714                         success = FALSE;
3715                         break;
3716                     }
3717                 }
3718 
3719                 if (success && inputItr.inExpansion()) {
3720                     // We otained a match by consuming part of a string obtained from
3721                     // case-folding a single code point of the input text.
3722                     // This does not count as an overall match.
3723                     success = FALSE;
3724                 }
3725 
3726                 if (success) {
3727                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3728                 } else {
3729                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3730                 }
3731 
3732             }
3733             break;
3734 
3735         case URX_STO_INP_LOC:
3736             {
3737                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3738                 fp->fExtra[opValue] = fp->fInputIdx;
3739             }
3740             break;
3741 
3742         case URX_JMPX:
3743             {
3744                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3745                 fp->fPatIdx += 1;
3746                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3747                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3748                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3749                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3750                 if (savedInputIdx < fp->fInputIdx) {
3751                     fp->fPatIdx = opValue;                               // JMP
3752                 } else {
3753                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3754                 }
3755             }
3756             break;
3757 
3758         case URX_LA_START:
3759             {
3760                 // Entering a look around block.
3761                 // Save Stack Ptr, Input Pos.
3762                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3763                 fData[opValue]   = fStack->size();
3764                 fData[opValue+1] = fp->fInputIdx;
3765                 fData[opValue+2] = fActiveStart;
3766                 fData[opValue+3] = fActiveLimit;
3767                 fActiveStart     = fLookStart;          // Set the match region change for
3768                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3769             }
3770             break;
3771 
3772         case URX_LA_END:
3773             {
3774                 // Leaving a look-ahead block.
3775                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3776                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3777                 int32_t stackSize = fStack->size();
3778                 int32_t newStackSize =(int32_t)fData[opValue];
3779                 U_ASSERT(stackSize >= newStackSize);
3780                 if (stackSize > newStackSize) {
3781                     // Copy the current top frame back to the new (cut back) top frame.
3782                     //   This makes the capture groups from within the look-ahead
3783                     //   expression available.
3784                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3785                     int32_t j;
3786                     for (j=0; j<fFrameSize; j++) {
3787                         newFP[j] = ((int64_t *)fp)[j];
3788                     }
3789                     fp = (REStackFrame *)newFP;
3790                     fStack->setSize(newStackSize);
3791                 }
3792                 fp->fInputIdx = fData[opValue+1];
3793 
3794                 // Restore the active region bounds in the input string; they may have
3795                 //    been changed because of transparent bounds on a Region.
3796                 fActiveStart = fData[opValue+2];
3797                 fActiveLimit = fData[opValue+3];
3798                 U_ASSERT(fActiveStart >= 0);
3799                 U_ASSERT(fActiveLimit <= fInputLength);
3800             }
3801             break;
3802 
3803         case URX_ONECHAR_I:
3804             // Case insensitive one char.  The char from the pattern is already case folded.
3805             // Input text is not, but case folding the input can not reduce two or more code
3806             // points to one.
3807             if (fp->fInputIdx < fActiveLimit) {
3808                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3809 
3810                 UChar32 c = UTEXT_NEXT32(fInputText);
3811                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3812                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3813                     break;
3814                 }
3815             } else {
3816                 fHitEnd = TRUE;
3817             }
3818 
3819             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3820             break;
3821 
3822         case URX_STRING_I:
3823             {
3824                 // Case-insensitive test input against a literal string.
3825                 // Strings require two slots in the compiled pattern, one for the
3826                 //   offset to the string text, and one for the length.
3827                 //   The compiled string has already been case folded.
3828                 {
3829                     const UChar *patternString = litText + opValue;
3830                     int32_t      patternStringIdx  = 0;
3831 
3832                     op      = (int32_t)pat[fp->fPatIdx];
3833                     fp->fPatIdx++;
3834                     opType  = URX_TYPE(op);
3835                     opValue = URX_VAL(op);
3836                     U_ASSERT(opType == URX_STRING_LEN);
3837                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3838 
3839 
3840                     UChar32   cPattern;
3841                     UChar32   cText;
3842                     UBool     success = TRUE;
3843 
3844                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3845                     CaseFoldingUTextIterator inputIterator(*fInputText);
3846                     while (patternStringIdx < patternStringLen) {
3847                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3848                             success = FALSE;
3849                             fHitEnd = TRUE;
3850                             break;
3851                         }
3852                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3853                         cText = inputIterator.next();
3854                         if (cText != cPattern) {
3855                             success = FALSE;
3856                             break;
3857                         }
3858                     }
3859                     if (inputIterator.inExpansion()) {
3860                         success = FALSE;
3861                     }
3862 
3863                     if (success) {
3864                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3865                     } else {
3866                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3867                     }
3868                 }
3869             }
3870             break;
3871 
3872         case URX_LB_START:
3873             {
3874                 // Entering a look-behind block.
3875                 // Save Stack Ptr, Input Pos and active input region.
3876                 //   TODO:  implement transparent bounds.  Ticket #6067
3877                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3878                 fData[opValue]   = fStack->size();
3879                 fData[opValue+1] = fp->fInputIdx;
3880                 // Save input string length, then reset to pin any matches to end at
3881                 //   the current position.
3882                 fData[opValue+2] = fActiveStart;
3883                 fData[opValue+3] = fActiveLimit;
3884                 fActiveStart     = fRegionStart;
3885                 fActiveLimit     = fp->fInputIdx;
3886                 // Init the variable containing the start index for attempted matches.
3887                 fData[opValue+4] = -1;
3888             }
3889             break;
3890 
3891 
3892         case URX_LB_CONT:
3893             {
3894                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3895                 //    at all possible input starting positions.
3896 
3897                 // Fetch the min and max possible match lengths.  They are the operands
3898                 //   of this op in the pattern.
3899                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3900                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3901                 if (!UTEXT_USES_U16(fInputText)) {
3902                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3903                     // The max length need not be exact; it just needs to be >= actual maximum.
3904                     maxML *= 3;
3905                 }
3906                 U_ASSERT(minML <= maxML);
3907                 U_ASSERT(minML >= 0);
3908 
3909                 // Fetch (from data) the last input index where a match was attempted.
3910                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3911                 int64_t  &lbStartIdx = fData[opValue+4];
3912                 if (lbStartIdx < 0) {
3913                     // First time through loop.
3914                     lbStartIdx = fp->fInputIdx - minML;
3915                     if (lbStartIdx > 0) {
3916                         // move index to a code point boudary, if it's not on one already.
3917                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3918                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3919                     }
3920                 } else {
3921                     // 2nd through nth time through the loop.
3922                     // Back up start position for match by one.
3923                     if (lbStartIdx == 0) {
3924                         (lbStartIdx)--;
3925                     } else {
3926                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3927                         (void)UTEXT_PREVIOUS32(fInputText);
3928                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929                     }
3930                 }
3931 
3932                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3933                     // We have tried all potential match starting points without
3934                     //  getting a match.  Backtrack out, and out of the
3935                     //   Look Behind altogether.
3936                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3937                     fActiveStart = fData[opValue+2];
3938                     fActiveLimit = fData[opValue+3];
3939                     U_ASSERT(fActiveStart >= 0);
3940                     U_ASSERT(fActiveLimit <= fInputLength);
3941                     break;
3942                 }
3943 
3944                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3945                 //      (successful match will fall off the end of the loop.)
3946                 fp = StateSave(fp, fp->fPatIdx-3, status);
3947                 fp->fInputIdx = lbStartIdx;
3948             }
3949             break;
3950 
3951         case URX_LB_END:
3952             // End of a look-behind block, after a successful match.
3953             {
3954                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3955                 if (fp->fInputIdx != fActiveLimit) {
3956                     //  The look-behind expression matched, but the match did not
3957                     //    extend all the way to the point that we are looking behind from.
3958                     //  FAIL out of here, which will take us back to the LB_CONT, which
3959                     //     will retry the match starting at another position or fail
3960                     //     the look-behind altogether, whichever is appropriate.
3961                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3962                     break;
3963                 }
3964 
3965                 // Look-behind match is good.  Restore the orignal input string region,
3966                 //   which had been truncated to pin the end of the lookbehind match to the
3967                 //   position being looked-behind.
3968                 fActiveStart = fData[opValue+2];
3969                 fActiveLimit = fData[opValue+3];
3970                 U_ASSERT(fActiveStart >= 0);
3971                 U_ASSERT(fActiveLimit <= fInputLength);
3972             }
3973             break;
3974 
3975 
3976         case URX_LBN_CONT:
3977             {
3978                 // Negative Look-Behind, at top of loop checking for matches of LB expression
3979                 //    at all possible input starting positions.
3980 
3981                 // Fetch the extra parameters of this op.
3982                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3983                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3984                 if (!UTEXT_USES_U16(fInputText)) {
3985                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3986                     // The max length need not be exact; it just needs to be >= actual maximum.
3987                     maxML *= 3;
3988                 }
3989                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3990                         continueLoc = URX_VAL(continueLoc);
3991                 U_ASSERT(minML <= maxML);
3992                 U_ASSERT(minML >= 0);
3993                 U_ASSERT(continueLoc > fp->fPatIdx);
3994 
3995                 // Fetch (from data) the last input index where a match was attempted.
3996                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3997                 int64_t  &lbStartIdx = fData[opValue+4];
3998                 if (lbStartIdx < 0) {
3999                     // First time through loop.
4000                     lbStartIdx = fp->fInputIdx - minML;
4001                     if (lbStartIdx > 0) {
4002                         // move index to a code point boudary, if it's not on one already.
4003                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4004                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4005                     }
4006                 } else {
4007                     // 2nd through nth time through the loop.
4008                     // Back up start position for match by one.
4009                     if (lbStartIdx == 0) {
4010                         (lbStartIdx)--;
4011                     } else {
4012                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4013                         (void)UTEXT_PREVIOUS32(fInputText);
4014                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4015                     }
4016                 }
4017 
4018                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4019                     // We have tried all potential match starting points without
4020                     //  getting a match, which means that the negative lookbehind as
4021                     //  a whole has succeeded.  Jump forward to the continue location
4022                     fActiveStart = fData[opValue+2];
4023                     fActiveLimit = fData[opValue+3];
4024                     U_ASSERT(fActiveStart >= 0);
4025                     U_ASSERT(fActiveLimit <= fInputLength);
4026                     fp->fPatIdx = continueLoc;
4027                     break;
4028                 }
4029 
4030                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4031                 //      (successful match will cause a FAIL out of the loop altogether.)
4032                 fp = StateSave(fp, fp->fPatIdx-4, status);
4033                 fp->fInputIdx = lbStartIdx;
4034             }
4035             break;
4036 
4037         case URX_LBN_END:
4038             // End of a negative look-behind block, after a successful match.
4039             {
4040                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4041                 if (fp->fInputIdx != fActiveLimit) {
4042                     //  The look-behind expression matched, but the match did not
4043                     //    extend all the way to the point that we are looking behind from.
4044                     //  FAIL out of here, which will take us back to the LB_CONT, which
4045                     //     will retry the match starting at another position or succeed
4046                     //     the look-behind altogether, whichever is appropriate.
4047                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4048                     break;
4049                 }
4050 
4051                 // Look-behind expression matched, which means look-behind test as
4052                 //   a whole Fails
4053 
4054                 //   Restore the orignal input string length, which had been truncated
4055                 //   inorder to pin the end of the lookbehind match
4056                 //   to the position being looked-behind.
4057                 fActiveStart = fData[opValue+2];
4058                 fActiveLimit = fData[opValue+3];
4059                 U_ASSERT(fActiveStart >= 0);
4060                 U_ASSERT(fActiveLimit <= fInputLength);
4061 
4062                 // Restore original stack position, discarding any state saved
4063                 //   by the successful pattern match.
4064                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4065                 int32_t newStackSize = (int32_t)fData[opValue];
4066                 U_ASSERT(fStack->size() > newStackSize);
4067                 fStack->setSize(newStackSize);
4068 
4069                 //  FAIL, which will take control back to someplace
4070                 //  prior to entering the look-behind test.
4071                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4072             }
4073             break;
4074 
4075 
4076         case URX_LOOP_SR_I:
4077             // Loop Initialization for the optimized implementation of
4078             //     [some character set]*
4079             //   This op scans through all matching input.
4080             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4081             {
4082                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4083                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4084                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4085 
4086                 // Loop through input, until either the input is exhausted or
4087                 //   we reach a character that is not a member of the set.
4088                 int64_t ix = fp->fInputIdx;
4089                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4090                 for (;;) {
4091                     if (ix >= fActiveLimit) {
4092                         fHitEnd = TRUE;
4093                         break;
4094                     }
4095                     UChar32 c = UTEXT_NEXT32(fInputText);
4096                     if (c<256) {
4097                         if (s8->contains(c) == FALSE) {
4098                             break;
4099                         }
4100                     } else {
4101                         if (s->contains(c) == FALSE) {
4102                             break;
4103                         }
4104                     }
4105                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4106                 }
4107 
4108                 // If there were no matching characters, skip over the loop altogether.
4109                 //   The loop doesn't run at all, a * op always succeeds.
4110                 if (ix == fp->fInputIdx) {
4111                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4112                     break;
4113                 }
4114 
4115                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4116                 //   must follow.  It's operand is the stack location
4117                 //   that holds the starting input index for the match of this [set]*
4118                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4119                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4120                 int32_t stackLoc = URX_VAL(loopcOp);
4121                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4122                 fp->fExtra[stackLoc] = fp->fInputIdx;
4123                 fp->fInputIdx = ix;
4124 
4125                 // Save State to the URX_LOOP_C op that follows this one,
4126                 //   so that match failures in the following code will return to there.
4127                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4128                 fp = StateSave(fp, fp->fPatIdx, status);
4129                 fp->fPatIdx++;
4130             }
4131             break;
4132 
4133 
4134         case URX_LOOP_DOT_I:
4135             // Loop Initialization for the optimized implementation of .*
4136             //   This op scans through all remaining input.
4137             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4138             {
4139                 // Loop through input until the input is exhausted (we reach an end-of-line)
4140                 // In DOTALL mode, we can just go straight to the end of the input.
4141                 int64_t ix;
4142                 if ((opValue & 1) == 1) {
4143                     // Dot-matches-All mode.  Jump straight to the end of the string.
4144                     ix = fActiveLimit;
4145                     fHitEnd = TRUE;
4146                 } else {
4147                     // NOT DOT ALL mode.  Line endings do not match '.'
4148                     // Scan forward until a line ending or end of input.
4149                     ix = fp->fInputIdx;
4150                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4151                     for (;;) {
4152                         if (ix >= fActiveLimit) {
4153                             fHitEnd = TRUE;
4154                             break;
4155                         }
4156                         UChar32 c = UTEXT_NEXT32(fInputText);
4157                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4158                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4159                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4160                                     isLineTerminator(c))) {
4161                                 //  char is a line ending.  Exit the scanning loop.
4162                                 break;
4163                             }
4164                         }
4165                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4166                     }
4167                 }
4168 
4169                 // If there were no matching characters, skip over the loop altogether.
4170                 //   The loop doesn't run at all, a * op always succeeds.
4171                 if (ix == fp->fInputIdx) {
4172                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4173                     break;
4174                 }
4175 
4176                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4177                 //   must follow.  It's operand is the stack location
4178                 //   that holds the starting input index for the match of this .*
4179                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4180                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4181                 int32_t stackLoc = URX_VAL(loopcOp);
4182                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4183                 fp->fExtra[stackLoc] = fp->fInputIdx;
4184                 fp->fInputIdx = ix;
4185 
4186                 // Save State to the URX_LOOP_C op that follows this one,
4187                 //   so that match failures in the following code will return to there.
4188                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4189                 fp = StateSave(fp, fp->fPatIdx, status);
4190                 fp->fPatIdx++;
4191             }
4192             break;
4193 
4194 
4195         case URX_LOOP_C:
4196             {
4197                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4198                 backSearchIndex = fp->fExtra[opValue];
4199                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4200                 if (backSearchIndex == fp->fInputIdx) {
4201                     // We've backed up the input idx to the point that the loop started.
4202                     // The loop is done.  Leave here without saving state.
4203                     //  Subsequent failures won't come back here.
4204                     break;
4205                 }
4206                 // Set up for the next iteration of the loop, with input index
4207                 //   backed up by one from the last time through,
4208                 //   and a state save to this instruction in case the following code fails again.
4209                 //   (We're going backwards because this loop emulates stack unwinding, not
4210                 //    the initial scan forward.)
4211                 U_ASSERT(fp->fInputIdx > 0);
4212                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4213                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4214                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4215 
4216                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4217                 if (prevC == 0x0a &&
4218                     fp->fInputIdx > backSearchIndex &&
4219                     twoPrevC == 0x0d) {
4220                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4221                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4222                         // .*, stepping back over CRLF pair.
4223                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4224                     }
4225                 }
4226 
4227 
4228                 fp = StateSave(fp, fp->fPatIdx-1, status);
4229             }
4230             break;
4231 
4232 
4233 
4234         default:
4235             // Trouble.  The compiled pattern contains an entry with an
4236             //           unrecognized type tag.
4237             UPRV_UNREACHABLE;
4238         }
4239 
4240         if (U_FAILURE(status)) {
4241             isMatch = FALSE;
4242             break;
4243         }
4244     }
4245 
4246 breakFromLoop:
4247     fMatch = isMatch;
4248     if (isMatch) {
4249         fLastMatchEnd = fMatchEnd;
4250         fMatchStart   = startIdx;
4251         fMatchEnd     = fp->fInputIdx;
4252     }
4253 
4254 #ifdef REGEX_RUN_DEBUG
4255     if (fTraceDebug) {
4256         if (isMatch) {
4257             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4258         } else {
4259             printf("No match\n\n");
4260         }
4261     }
4262 #endif
4263 
4264     fFrame = fp;                // The active stack frame when the engine stopped.
4265                                 //   Contains the capture group results that we need to
4266                                 //    access later.
4267     return;
4268 }
4269 
4270 
4271 //--------------------------------------------------------------------------------
4272 //
4273 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4274 //                  assumption that the entire string is available in the UText's
4275 //                  chunk buffer. For now, that means we can use int32_t indexes,
4276 //                  except for anything that needs to be saved (like group starts
4277 //                  and ends).
4278 //
4279 //                  startIdx:    begin matching a this index.
4280 //                  toEnd:       if true, match must extend to end of the input region
4281 //
4282 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4283 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4284     UBool       isMatch  = FALSE;      // True if the we have a match.
4285 
4286     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4287 
4288     int32_t     op;                    // Operation from the compiled pattern, split into
4289     int32_t     opType;                //    the opcode
4290     int32_t     opValue;               //    and the operand value.
4291 
4292 #ifdef REGEX_RUN_DEBUG
4293     if (fTraceDebug) {
4294         printf("MatchAt(startIdx=%d)\n", startIdx);
4295         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4296         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4297     }
4298 #endif
4299 
4300     if (U_FAILURE(status)) {
4301         return;
4302     }
4303 
4304     //  Cache frequently referenced items from the compiled pattern
4305     //
4306     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4307 
4308     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4309     UVector             *fSets         = fPattern->fSets;
4310 
4311     const UChar         *inputBuf      = fInputText->chunkContents;
4312 
4313     fFrameSize = fPattern->fFrameSize;
4314     REStackFrame        *fp            = resetStack();
4315     if (U_FAILURE(fDeferredStatus)) {
4316         status = fDeferredStatus;
4317         return;
4318     }
4319 
4320     fp->fPatIdx   = 0;
4321     fp->fInputIdx = startIdx;
4322 
4323     // Zero out the pattern's static data
4324     int32_t i;
4325     for (i = 0; i<fPattern->fDataSize; i++) {
4326         fData[i] = 0;
4327     }
4328 
4329     //
4330     //  Main loop for interpreting the compiled pattern.
4331     //  One iteration of the loop per pattern operation performed.
4332     //
4333     for (;;) {
4334         op      = (int32_t)pat[fp->fPatIdx];
4335         opType  = URX_TYPE(op);
4336         opValue = URX_VAL(op);
4337 #ifdef REGEX_RUN_DEBUG
4338         if (fTraceDebug) {
4339             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4340             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4341                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4342             fPattern->dumpOp(fp->fPatIdx);
4343         }
4344 #endif
4345         fp->fPatIdx++;
4346 
4347         switch (opType) {
4348 
4349 
4350         case URX_NOP:
4351             break;
4352 
4353 
4354         case URX_BACKTRACK:
4355             // Force a backtrack.  In some circumstances, the pattern compiler
4356             //   will notice that the pattern can't possibly match anything, and will
4357             //   emit one of these at that point.
4358             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4359             break;
4360 
4361 
4362         case URX_ONECHAR:
4363             if (fp->fInputIdx < fActiveLimit) {
4364                 UChar32 c;
4365                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4366                 if (c == opValue) {
4367                     break;
4368                 }
4369             } else {
4370                 fHitEnd = TRUE;
4371             }
4372             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373             break;
4374 
4375 
4376         case URX_STRING:
4377             {
4378                 // Test input against a literal string.
4379                 // Strings require two slots in the compiled pattern, one for the
4380                 //   offset to the string text, and one for the length.
4381                 int32_t   stringStartIdx = opValue;
4382                 int32_t   stringLen;
4383 
4384                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4385                 fp->fPatIdx++;
4386                 opType    = URX_TYPE(op);
4387                 stringLen = URX_VAL(op);
4388                 U_ASSERT(opType == URX_STRING_LEN);
4389                 U_ASSERT(stringLen >= 2);
4390 
4391                 const UChar * pInp = inputBuf + fp->fInputIdx;
4392                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4393                 const UChar * pPat = litText+stringStartIdx;
4394                 const UChar * pEnd = pInp + stringLen;
4395                 UBool success = TRUE;
4396                 while (pInp < pEnd) {
4397                     if (pInp >= pInpLimit) {
4398                         fHitEnd = TRUE;
4399                         success = FALSE;
4400                         break;
4401                     }
4402                     if (*pInp++ != *pPat++) {
4403                         success = FALSE;
4404                         break;
4405                     }
4406                 }
4407 
4408                 if (success) {
4409                     fp->fInputIdx += stringLen;
4410                 } else {
4411                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412                 }
4413             }
4414             break;
4415 
4416 
4417         case URX_STATE_SAVE:
4418             fp = StateSave(fp, opValue, status);
4419             break;
4420 
4421 
4422         case URX_END:
4423             // The match loop will exit via this path on a successful match,
4424             //   when we reach the end of the pattern.
4425             if (toEnd && fp->fInputIdx != fActiveLimit) {
4426                 // The pattern matched, but not to the end of input.  Try some more.
4427                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4428                 break;
4429             }
4430             isMatch = TRUE;
4431             goto  breakFromLoop;
4432 
4433             // Start and End Capture stack frame variables are laid out out like this:
4434             //  fp->fExtra[opValue]  - The start of a completed capture group
4435             //             opValue+1 - The end   of a completed capture group
4436             //             opValue+2 - the start of a capture group whose end
4437             //                          has not yet been reached (and might not ever be).
4438         case URX_START_CAPTURE:
4439             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4440             fp->fExtra[opValue+2] = fp->fInputIdx;
4441             break;
4442 
4443 
4444         case URX_END_CAPTURE:
4445             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4446             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4447             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4448             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4449             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4450             break;
4451 
4452 
4453         case URX_DOLLAR:                   //  $, test for End of line
4454             //     or for position before new line at end of input
4455             if (fp->fInputIdx < fAnchorLimit-2) {
4456                 // We are no where near the end of input.  Fail.
4457                 //   This is the common case.  Keep it first.
4458                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4459                 break;
4460             }
4461             if (fp->fInputIdx >= fAnchorLimit) {
4462                 // We really are at the end of input.  Success.
4463                 fHitEnd = TRUE;
4464                 fRequireEnd = TRUE;
4465                 break;
4466             }
4467 
4468             // If we are positioned just before a new-line that is located at the
4469             //   end of input, succeed.
4470             if (fp->fInputIdx == fAnchorLimit-1) {
4471                 UChar32 c;
4472                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4473 
4474                 if (isLineTerminator(c)) {
4475                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4476                         // At new-line at end of input. Success
4477                         fHitEnd = TRUE;
4478                         fRequireEnd = TRUE;
4479                         break;
4480                     }
4481                 }
4482             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4483                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4484                     fHitEnd = TRUE;
4485                     fRequireEnd = TRUE;
4486                     break;                         // At CR/LF at end of input.  Success
4487             }
4488 
4489             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4490 
4491             break;
4492 
4493 
4494         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4495             if (fp->fInputIdx >= fAnchorLimit-1) {
4496                 // Either at the last character of input, or off the end.
4497                 if (fp->fInputIdx == fAnchorLimit-1) {
4498                     // At last char of input.  Success if it's a new line.
4499                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4500                         fHitEnd = TRUE;
4501                         fRequireEnd = TRUE;
4502                         break;
4503                     }
4504                 } else {
4505                     // Off the end of input.  Success.
4506                     fHitEnd = TRUE;
4507                     fRequireEnd = TRUE;
4508                     break;
4509                 }
4510             }
4511 
4512             // Not at end of input.  Back-track out.
4513             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4514             break;
4515 
4516 
4517         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4518             {
4519                 if (fp->fInputIdx >= fAnchorLimit) {
4520                     // We really are at the end of input.  Success.
4521                     fHitEnd = TRUE;
4522                     fRequireEnd = TRUE;
4523                     break;
4524                 }
4525                 // If we are positioned just before a new-line, succeed.
4526                 // It makes no difference where the new-line is within the input.
4527                 UChar32 c = inputBuf[fp->fInputIdx];
4528                 if (isLineTerminator(c)) {
4529                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4530                     //  In multi-line mode, hitting a new-line just before the end of input does not
4531                     //   set the hitEnd or requireEnd flags
4532                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4533                         break;
4534                     }
4535                 }
4536                 // not at a new line.  Fail.
4537                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4538             }
4539             break;
4540 
4541 
4542         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4543             {
4544                 if (fp->fInputIdx >= fAnchorLimit) {
4545                     // We really are at the end of input.  Success.
4546                     fHitEnd = TRUE;
4547                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4548                     break;               //   adding a new-line would not lose the match.
4549                 }
4550                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4551                 // It makes no difference where the new-line is within the input.
4552                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4553                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4554                 }
4555             }
4556             break;
4557 
4558 
4559         case URX_CARET:                    //  ^, test for start of line
4560             if (fp->fInputIdx != fAnchorStart) {
4561                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4562             }
4563             break;
4564 
4565 
4566         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4567             {
4568                 if (fp->fInputIdx == fAnchorStart) {
4569                     // We are at the start input.  Success.
4570                     break;
4571                 }
4572                 // Check whether character just before the current pos is a new-line
4573                 //   unless we are at the end of input
4574                 UChar  c = inputBuf[fp->fInputIdx - 1];
4575                 if ((fp->fInputIdx < fAnchorLimit) &&
4576                     isLineTerminator(c)) {
4577                     //  It's a new-line.  ^ is true.  Success.
4578                     //  TODO:  what should be done with positions between a CR and LF?
4579                     break;
4580                 }
4581                 // Not at the start of a line.  Fail.
4582                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4583             }
4584             break;
4585 
4586 
4587         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4588             {
4589                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4590                 if (fp->fInputIdx <= fAnchorStart) {
4591                     // We are at the start input.  Success.
4592                     break;
4593                 }
4594                 // Check whether character just before the current pos is a new-line
4595                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4596                 UChar  c = inputBuf[fp->fInputIdx - 1];
4597                 if (c != 0x0a) {
4598                     // Not at the start of a line.  Back-track out.
4599                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4600                 }
4601             }
4602             break;
4603 
4604         case URX_BACKSLASH_B:          // Test for word boundaries
4605             {
4606                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4607                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4608                 if (!success) {
4609                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4610                 }
4611             }
4612             break;
4613 
4614 
4615         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4616             {
4617                 UBool success = isUWordBoundary(fp->fInputIdx, status);
4618                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4619                 if (!success) {
4620                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621                 }
4622             }
4623             break;
4624 
4625 
4626         case URX_BACKSLASH_D:            // Test for decimal digit
4627             {
4628                 if (fp->fInputIdx >= fActiveLimit) {
4629                     fHitEnd = TRUE;
4630                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4631                     break;
4632                 }
4633 
4634                 UChar32 c;
4635                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4636                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4637                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4638                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4639                 if (!success) {
4640                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4641                 }
4642             }
4643             break;
4644 
4645 
4646         case URX_BACKSLASH_G:          // Test for position at end of previous match
4647             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4648                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4649             }
4650             break;
4651 
4652 
4653         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4654             {
4655                 if (fp->fInputIdx >= fActiveLimit) {
4656                     fHitEnd = TRUE;
4657                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658                     break;
4659                 }
4660                 UChar32 c;
4661                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4662                 int8_t ctype = u_charType(c);
4663                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4664                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4665                 if (!success) {
4666                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4667                 }
4668             }
4669             break;
4670 
4671 
4672         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4673             {
4674                 if (fp->fInputIdx >= fActiveLimit) {
4675                     fHitEnd = TRUE;
4676                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4677                     break;
4678                 }
4679                 UChar32 c;
4680                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4681                 if (isLineTerminator(c)) {
4682                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4683                         // Check for CR/LF sequence. Consume both together when found.
4684                         UChar c2;
4685                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4686                         if (c2 != 0x0a) {
4687                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4688                         }
4689                     }
4690                 } else {
4691                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4692                 }
4693             }
4694             break;
4695 
4696 
4697         case URX_BACKSLASH_V:         // Any single code point line ending.
4698             {
4699                 if (fp->fInputIdx >= fActiveLimit) {
4700                     fHitEnd = TRUE;
4701                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4702                     break;
4703                 }
4704                 UChar32 c;
4705                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4706                 UBool success = isLineTerminator(c);
4707                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4708                 if (!success) {
4709                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4710                 }
4711             }
4712             break;
4713 
4714 
4715         case URX_BACKSLASH_X:
4716             //  Match a Grapheme, as defined by Unicode UAX 29.
4717 
4718             // Fail if at end of input
4719             if (fp->fInputIdx >= fActiveLimit) {
4720                 fHitEnd = TRUE;
4721                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4722                 break;
4723             }
4724 
4725             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4726             if (fp->fInputIdx >= fActiveLimit) {
4727                 fHitEnd = TRUE;
4728                 fp->fInputIdx = fActiveLimit;
4729             }
4730             break;
4731 
4732 
4733         case URX_BACKSLASH_Z:          // Test for end of Input
4734             if (fp->fInputIdx < fAnchorLimit) {
4735                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736             } else {
4737                 fHitEnd = TRUE;
4738                 fRequireEnd = TRUE;
4739             }
4740             break;
4741 
4742 
4743 
4744         case URX_STATIC_SETREF:
4745             {
4746                 // Test input character against one of the predefined sets
4747                 //    (Word Characters, for example)
4748                 // The high bit of the op value is a flag for the match polarity.
4749                 //    0:   success if input char is in set.
4750                 //    1:   success if input char is not in set.
4751                 if (fp->fInputIdx >= fActiveLimit) {
4752                     fHitEnd = TRUE;
4753                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4754                     break;
4755                 }
4756 
4757                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4758                 opValue &= ~URX_NEG_SET;
4759                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4760 
4761                 UChar32 c;
4762                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4763                 if (c < 256) {
4764                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4765                     if (s8.contains(c)) {
4766                         success = !success;
4767                     }
4768                 } else {
4769                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4770                     if (s.contains(c)) {
4771                         success = !success;
4772                     }
4773                 }
4774                 if (!success) {
4775                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4776                 }
4777             }
4778             break;
4779 
4780 
4781         case URX_STAT_SETREF_N:
4782             {
4783                 // Test input character for NOT being a member of  one of
4784                 //    the predefined sets (Word Characters, for example)
4785                 if (fp->fInputIdx >= fActiveLimit) {
4786                     fHitEnd = TRUE;
4787                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4788                     break;
4789                 }
4790 
4791                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4792 
4793                 UChar32  c;
4794                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4795                 if (c < 256) {
4796                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4797                     if (s8.contains(c) == FALSE) {
4798                         break;
4799                     }
4800                 } else {
4801                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4802                     if (s.contains(c) == FALSE) {
4803                         break;
4804                     }
4805                 }
4806                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4807             }
4808             break;
4809 
4810 
4811         case URX_SETREF:
4812             {
4813                 if (fp->fInputIdx >= fActiveLimit) {
4814                     fHitEnd = TRUE;
4815                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4816                     break;
4817                 }
4818 
4819                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4820 
4821                 // There is input left.  Pick up one char and test it for set membership.
4822                 UChar32  c;
4823                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4824                 if (c<256) {
4825                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4826                     if (s8->contains(c)) {
4827                         // The character is in the set.  A Match.
4828                         break;
4829                     }
4830                 } else {
4831                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4832                     if (s->contains(c)) {
4833                         // The character is in the set.  A Match.
4834                         break;
4835                     }
4836                 }
4837 
4838                 // the character wasn't in the set.
4839                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840             }
4841             break;
4842 
4843 
4844         case URX_DOTANY:
4845             {
4846                 // . matches anything, but stops at end-of-line.
4847                 if (fp->fInputIdx >= fActiveLimit) {
4848                     // At end of input.  Match failed.  Backtrack out.
4849                     fHitEnd = TRUE;
4850                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851                     break;
4852                 }
4853 
4854                 // There is input left.  Advance over one char, unless we've hit end-of-line
4855                 UChar32  c;
4856                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4857                 if (isLineTerminator(c)) {
4858                     // End of line in normal mode.   . does not match.
4859                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4860                     break;
4861                 }
4862             }
4863             break;
4864 
4865 
4866         case URX_DOTANY_ALL:
4867             {
4868                 // . in dot-matches-all (including new lines) mode
4869                 if (fp->fInputIdx >= fActiveLimit) {
4870                     // At end of input.  Match failed.  Backtrack out.
4871                     fHitEnd = TRUE;
4872                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4873                     break;
4874                 }
4875 
4876                 // There is input left.  Advance over one char, except if we are
4877                 //   at a cr/lf, advance over both of them.
4878                 UChar32 c;
4879                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4880                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4881                     // In the case of a CR/LF, we need to advance over both.
4882                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4883                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4884                     }
4885                 }
4886             }
4887             break;
4888 
4889 
4890         case URX_DOTANY_UNIX:
4891             {
4892                 // '.' operator, matches all, but stops at end-of-line.
4893                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4894                 if (fp->fInputIdx >= fActiveLimit) {
4895                     // At end of input.  Match failed.  Backtrack out.
4896                     fHitEnd = TRUE;
4897                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4898                     break;
4899                 }
4900 
4901                 // There is input left.  Advance over one char, unless we've hit end-of-line
4902                 UChar32 c;
4903                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4904                 if (c == 0x0a) {
4905                     // End of line in normal mode.   '.' does not match the \n
4906                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4907                 }
4908             }
4909             break;
4910 
4911 
4912         case URX_JMP:
4913             fp->fPatIdx = opValue;
4914             break;
4915 
4916         case URX_FAIL:
4917             isMatch = FALSE;
4918             goto breakFromLoop;
4919 
4920         case URX_JMP_SAV:
4921             U_ASSERT(opValue < fPattern->fCompiledPat->size());
4922             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4923             fp->fPatIdx = opValue;                         // Then JMP.
4924             break;
4925 
4926         case URX_JMP_SAV_X:
4927             // This opcode is used with (x)+, when x can match a zero length string.
4928             // Same as JMP_SAV, except conditional on the match having made forward progress.
4929             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4930             //   data address of the input position at the start of the loop.
4931             {
4932                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4933                 int32_t  stoOp = (int32_t)pat[opValue-1];
4934                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4935                 int32_t  frameLoc = URX_VAL(stoOp);
4936                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4937                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4938                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4939                 if (prevInputIdx < fp->fInputIdx) {
4940                     // The match did make progress.  Repeat the loop.
4941                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4942                     fp->fPatIdx = opValue;
4943                     fp->fExtra[frameLoc] = fp->fInputIdx;
4944                 }
4945                 // If the input position did not advance, we do nothing here,
4946                 //   execution will fall out of the loop.
4947             }
4948             break;
4949 
4950         case URX_CTR_INIT:
4951             {
4952                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4953                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4954 
4955                 // Pick up the three extra operands that CTR_INIT has, and
4956                 //    skip the pattern location counter past
4957                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4958                 fp->fPatIdx += 3;
4959                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4960                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4961                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4962                 U_ASSERT(minCount>=0);
4963                 U_ASSERT(maxCount>=minCount || maxCount==-1);
4964                 U_ASSERT(loopLoc>=fp->fPatIdx);
4965 
4966                 if (minCount == 0) {
4967                     fp = StateSave(fp, loopLoc+1, status);
4968                 }
4969                 if (maxCount == -1) {
4970                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4971                 } else if (maxCount == 0) {
4972                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4973                 }
4974             }
4975             break;
4976 
4977         case URX_CTR_LOOP:
4978             {
4979                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4980                 int32_t initOp = (int32_t)pat[opValue];
4981                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4982                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4983                 int32_t minCount  = (int32_t)pat[opValue+2];
4984                 int32_t maxCount  = (int32_t)pat[opValue+3];
4985                 (*pCounter)++;
4986                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4987                     U_ASSERT(*pCounter == maxCount);
4988                     break;
4989                 }
4990                 if (*pCounter >= minCount) {
4991                     if (maxCount == -1) {
4992                         // Loop has no hard upper bound.
4993                         // Check that it is progressing through the input, break if it is not.
4994                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
4995                         if (fp->fInputIdx == *pLastInputIdx) {
4996                             break;
4997                         } else {
4998                             *pLastInputIdx = fp->fInputIdx;
4999                         }
5000                     }
5001                     fp = StateSave(fp, fp->fPatIdx, status);
5002                 } else {
5003                     // Increment time-out counter. (StateSave() does it if count >= minCount)
5004                     fTickCounter--;
5005                     if (fTickCounter <= 0) {
5006                         IncrementTime(status);    // Re-initializes fTickCounter
5007                     }
5008                 }
5009                 fp->fPatIdx = opValue + 4;    // Loop back.
5010             }
5011             break;
5012 
5013         case URX_CTR_INIT_NG:
5014             {
5015                 // Initialize a non-greedy loop
5016                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5017                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5018 
5019                 // Pick up the three extra operands that CTR_INIT_NG has, and
5020                 //    skip the pattern location counter past
5021                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5022                 fp->fPatIdx += 3;
5023                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5024                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5025                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5026                 U_ASSERT(minCount>=0);
5027                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5028                 U_ASSERT(loopLoc>fp->fPatIdx);
5029                 if (maxCount == -1) {
5030                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5031                 }
5032 
5033                 if (minCount == 0) {
5034                     if (maxCount != 0) {
5035                         fp = StateSave(fp, fp->fPatIdx, status);
5036                     }
5037                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5038                 }
5039             }
5040             break;
5041 
5042         case URX_CTR_LOOP_NG:
5043             {
5044                 // Non-greedy {min, max} loops
5045                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5046                 int32_t initOp = (int32_t)pat[opValue];
5047                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5048                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5049                 int32_t minCount  = (int32_t)pat[opValue+2];
5050                 int32_t maxCount  = (int32_t)pat[opValue+3];
5051 
5052                 (*pCounter)++;
5053                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5054                     // The loop has matched the maximum permitted number of times.
5055                     //   Break out of here with no action.  Matching will
5056                     //   continue with the following pattern.
5057                     U_ASSERT(*pCounter == maxCount);
5058                     break;
5059                 }
5060 
5061                 if (*pCounter < minCount) {
5062                     // We haven't met the minimum number of matches yet.
5063                     //   Loop back for another one.
5064                     fp->fPatIdx = opValue + 4;    // Loop back.
5065                     fTickCounter--;
5066                     if (fTickCounter <= 0) {
5067                         IncrementTime(status);    // Re-initializes fTickCounter
5068                     }
5069                 } else {
5070                     // We do have the minimum number of matches.
5071 
5072                     // If there is no upper bound on the loop iterations, check that the input index
5073                     // is progressing, and stop the loop if it is not.
5074                     if (maxCount == -1) {
5075                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5076                         if (fp->fInputIdx == *pLastInputIdx) {
5077                             break;
5078                         }
5079                         *pLastInputIdx = fp->fInputIdx;
5080                     }
5081 
5082                     // Loop Continuation: we will fall into the pattern following the loop
5083                     //   (non-greedy, don't execute loop body first), but first do
5084                     //   a state save to the top of the loop, so that a match failure
5085                     //   in the following pattern will try another iteration of the loop.
5086                     fp = StateSave(fp, opValue + 4, status);
5087                 }
5088             }
5089             break;
5090 
5091         case URX_STO_SP:
5092             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5093             fData[opValue] = fStack->size();
5094             break;
5095 
5096         case URX_LD_SP:
5097             {
5098                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5099                 int32_t newStackSize = (int32_t)fData[opValue];
5100                 U_ASSERT(newStackSize <= fStack->size());
5101                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5102                 if (newFP == (int64_t *)fp) {
5103                     break;
5104                 }
5105                 int32_t j;
5106                 for (j=0; j<fFrameSize; j++) {
5107                     newFP[j] = ((int64_t *)fp)[j];
5108                 }
5109                 fp = (REStackFrame *)newFP;
5110                 fStack->setSize(newStackSize);
5111             }
5112             break;
5113 
5114         case URX_BACKREF:
5115             {
5116                 U_ASSERT(opValue < fFrameSize);
5117                 int64_t groupStartIdx = fp->fExtra[opValue];
5118                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5119                 U_ASSERT(groupStartIdx <= groupEndIdx);
5120                 int64_t inputIndex = fp->fInputIdx;
5121                 if (groupStartIdx < 0) {
5122                     // This capture group has not participated in the match thus far,
5123                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5124                     break;
5125                 }
5126                 UBool success = TRUE;
5127                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5128                     if (inputIndex >= fActiveLimit) {
5129                         success = FALSE;
5130                         fHitEnd = TRUE;
5131                         break;
5132                     }
5133                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5134                         success = FALSE;
5135                         break;
5136                     }
5137                 }
5138                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5139                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5140                     // Capture group ended with an unpaired lead surrogate.
5141                     // Back reference is not permitted to match lead only of a surrogatge pair.
5142                     success = FALSE;
5143                 }
5144                 if (success) {
5145                     fp->fInputIdx = inputIndex;
5146                 } else {
5147                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5148                 }
5149             }
5150             break;
5151 
5152         case URX_BACKREF_I:
5153             {
5154                 U_ASSERT(opValue < fFrameSize);
5155                 int64_t groupStartIdx = fp->fExtra[opValue];
5156                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5157                 U_ASSERT(groupStartIdx <= groupEndIdx);
5158                 if (groupStartIdx < 0) {
5159                     // This capture group has not participated in the match thus far,
5160                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5161                     break;
5162                 }
5163                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5164                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5165 
5166                 //   Note: if the capture group match was of an empty string the backref
5167                 //         match succeeds.  Verified by testing:  Perl matches succeed
5168                 //         in this case, so we do too.
5169 
5170                 UBool success = TRUE;
5171                 for (;;) {
5172                     UChar32 captureGroupChar = captureGroupItr.next();
5173                     if (captureGroupChar == U_SENTINEL) {
5174                         success = TRUE;
5175                         break;
5176                     }
5177                     UChar32 inputChar = inputItr.next();
5178                     if (inputChar == U_SENTINEL) {
5179                         success = FALSE;
5180                         fHitEnd = TRUE;
5181                         break;
5182                     }
5183                     if (inputChar != captureGroupChar) {
5184                         success = FALSE;
5185                         break;
5186                     }
5187                 }
5188 
5189                 if (success && inputItr.inExpansion()) {
5190                     // We otained a match by consuming part of a string obtained from
5191                     // case-folding a single code point of the input text.
5192                     // This does not count as an overall match.
5193                     success = FALSE;
5194                 }
5195 
5196                 if (success) {
5197                     fp->fInputIdx = inputItr.getIndex();
5198                 } else {
5199                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5200                 }
5201             }
5202             break;
5203 
5204         case URX_STO_INP_LOC:
5205             {
5206                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5207                 fp->fExtra[opValue] = fp->fInputIdx;
5208             }
5209             break;
5210 
5211         case URX_JMPX:
5212             {
5213                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5214                 fp->fPatIdx += 1;
5215                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5216                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5217                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5218                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5219                 if (savedInputIdx < fp->fInputIdx) {
5220                     fp->fPatIdx = opValue;                               // JMP
5221                 } else {
5222                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5223                 }
5224             }
5225             break;
5226 
5227         case URX_LA_START:
5228             {
5229                 // Entering a look around block.
5230                 // Save Stack Ptr, Input Pos.
5231                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5232                 fData[opValue]   = fStack->size();
5233                 fData[opValue+1] = fp->fInputIdx;
5234                 fData[opValue+2] = fActiveStart;
5235                 fData[opValue+3] = fActiveLimit;
5236                 fActiveStart     = fLookStart;          // Set the match region change for
5237                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5238             }
5239             break;
5240 
5241         case URX_LA_END:
5242             {
5243                 // Leaving a look around block.
5244                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5245                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5246                 int32_t stackSize = fStack->size();
5247                 int32_t newStackSize = (int32_t)fData[opValue];
5248                 U_ASSERT(stackSize >= newStackSize);
5249                 if (stackSize > newStackSize) {
5250                     // Copy the current top frame back to the new (cut back) top frame.
5251                     //   This makes the capture groups from within the look-ahead
5252                     //   expression available.
5253                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5254                     int32_t j;
5255                     for (j=0; j<fFrameSize; j++) {
5256                         newFP[j] = ((int64_t *)fp)[j];
5257                     }
5258                     fp = (REStackFrame *)newFP;
5259                     fStack->setSize(newStackSize);
5260                 }
5261                 fp->fInputIdx = fData[opValue+1];
5262 
5263                 // Restore the active region bounds in the input string; they may have
5264                 //    been changed because of transparent bounds on a Region.
5265                 fActiveStart = fData[opValue+2];
5266                 fActiveLimit = fData[opValue+3];
5267                 U_ASSERT(fActiveStart >= 0);
5268                 U_ASSERT(fActiveLimit <= fInputLength);
5269             }
5270             break;
5271 
5272         case URX_ONECHAR_I:
5273             if (fp->fInputIdx < fActiveLimit) {
5274                 UChar32 c;
5275                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5276                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5277                     break;
5278                 }
5279             } else {
5280                 fHitEnd = TRUE;
5281             }
5282             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5283             break;
5284 
5285         case URX_STRING_I:
5286             // Case-insensitive test input against a literal string.
5287             // Strings require two slots in the compiled pattern, one for the
5288             //   offset to the string text, and one for the length.
5289             //   The compiled string has already been case folded.
5290             {
5291                 const UChar *patternString = litText + opValue;
5292 
5293                 op      = (int32_t)pat[fp->fPatIdx];
5294                 fp->fPatIdx++;
5295                 opType  = URX_TYPE(op);
5296                 opValue = URX_VAL(op);
5297                 U_ASSERT(opType == URX_STRING_LEN);
5298                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5299 
5300                 UChar32      cText;
5301                 UChar32      cPattern;
5302                 UBool        success = TRUE;
5303                 int32_t      patternStringIdx  = 0;
5304                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5305                 while (patternStringIdx < patternStringLen) {
5306                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5307                     cText = inputIterator.next();
5308                     if (cText != cPattern) {
5309                         success = FALSE;
5310                         if (cText == U_SENTINEL) {
5311                             fHitEnd = TRUE;
5312                         }
5313                         break;
5314                     }
5315                 }
5316                 if (inputIterator.inExpansion()) {
5317                     success = FALSE;
5318                 }
5319 
5320                 if (success) {
5321                     fp->fInputIdx = inputIterator.getIndex();
5322                 } else {
5323                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5324                 }
5325             }
5326             break;
5327 
5328         case URX_LB_START:
5329             {
5330                 // Entering a look-behind block.
5331                 // Save Stack Ptr, Input Pos and active input region.
5332                 //   TODO:  implement transparent bounds.  Ticket #6067
5333                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5334                 fData[opValue]   = fStack->size();
5335                 fData[opValue+1] = fp->fInputIdx;
5336                 // Save input string length, then reset to pin any matches to end at
5337                 //   the current position.
5338                 fData[opValue+2] = fActiveStart;
5339                 fData[opValue+3] = fActiveLimit;
5340                 fActiveStart     = fRegionStart;
5341                 fActiveLimit     = fp->fInputIdx;
5342                 // Init the variable containing the start index for attempted matches.
5343                 fData[opValue+4] = -1;
5344             }
5345             break;
5346 
5347 
5348         case URX_LB_CONT:
5349             {
5350                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5351                 //    at all possible input starting positions.
5352 
5353                 // Fetch the min and max possible match lengths.  They are the operands
5354                 //   of this op in the pattern.
5355                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5356                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5357                 U_ASSERT(minML <= maxML);
5358                 U_ASSERT(minML >= 0);
5359 
5360                 // Fetch (from data) the last input index where a match was attempted.
5361                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5362                 int64_t  &lbStartIdx = fData[opValue+4];
5363                 if (lbStartIdx < 0) {
5364                     // First time through loop.
5365                     lbStartIdx = fp->fInputIdx - minML;
5366                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5367                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5368                     }
5369                 } else {
5370                     // 2nd through nth time through the loop.
5371                     // Back up start position for match by one.
5372                     if (lbStartIdx == 0) {
5373                         lbStartIdx--;
5374                     } else {
5375                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5376                     }
5377                 }
5378 
5379                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5380                     // We have tried all potential match starting points without
5381                     //  getting a match.  Backtrack out, and out of the
5382                     //   Look Behind altogether.
5383                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5384                     fActiveStart = fData[opValue+2];
5385                     fActiveLimit = fData[opValue+3];
5386                     U_ASSERT(fActiveStart >= 0);
5387                     U_ASSERT(fActiveLimit <= fInputLength);
5388                     break;
5389                 }
5390 
5391                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5392                 //      (successful match will fall off the end of the loop.)
5393                 fp = StateSave(fp, fp->fPatIdx-3, status);
5394                 fp->fInputIdx =  lbStartIdx;
5395             }
5396             break;
5397 
5398         case URX_LB_END:
5399             // End of a look-behind block, after a successful match.
5400             {
5401                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5402                 if (fp->fInputIdx != fActiveLimit) {
5403                     //  The look-behind expression matched, but the match did not
5404                     //    extend all the way to the point that we are looking behind from.
5405                     //  FAIL out of here, which will take us back to the LB_CONT, which
5406                     //     will retry the match starting at another position or fail
5407                     //     the look-behind altogether, whichever is appropriate.
5408                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5409                     break;
5410                 }
5411 
5412                 // Look-behind match is good.  Restore the orignal input string region,
5413                 //   which had been truncated to pin the end of the lookbehind match to the
5414                 //   position being looked-behind.
5415                 fActiveStart = fData[opValue+2];
5416                 fActiveLimit = fData[opValue+3];
5417                 U_ASSERT(fActiveStart >= 0);
5418                 U_ASSERT(fActiveLimit <= fInputLength);
5419             }
5420             break;
5421 
5422 
5423         case URX_LBN_CONT:
5424             {
5425                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5426                 //    at all possible input starting positions.
5427 
5428                 // Fetch the extra parameters of this op.
5429                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5430                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5431                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5432                 continueLoc = URX_VAL(continueLoc);
5433                 U_ASSERT(minML <= maxML);
5434                 U_ASSERT(minML >= 0);
5435                 U_ASSERT(continueLoc > fp->fPatIdx);
5436 
5437                 // Fetch (from data) the last input index where a match was attempted.
5438                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5439                 int64_t  &lbStartIdx = fData[opValue+4];
5440                 if (lbStartIdx < 0) {
5441                     // First time through loop.
5442                     lbStartIdx = fp->fInputIdx - minML;
5443                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5444                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5445                     }
5446                 } else {
5447                     // 2nd through nth time through the loop.
5448                     // Back up start position for match by one.
5449                     if (lbStartIdx == 0) {
5450                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5451                     } else {
5452                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5453                     }
5454                 }
5455 
5456                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5457                     // We have tried all potential match starting points without
5458                     //  getting a match, which means that the negative lookbehind as
5459                     //  a whole has succeeded.  Jump forward to the continue location
5460                     fActiveStart = fData[opValue+2];
5461                     fActiveLimit = fData[opValue+3];
5462                     U_ASSERT(fActiveStart >= 0);
5463                     U_ASSERT(fActiveLimit <= fInputLength);
5464                     fp->fPatIdx = continueLoc;
5465                     break;
5466                 }
5467 
5468                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5469                 //      (successful match will cause a FAIL out of the loop altogether.)
5470                 fp = StateSave(fp, fp->fPatIdx-4, status);
5471                 fp->fInputIdx =  lbStartIdx;
5472             }
5473             break;
5474 
5475         case URX_LBN_END:
5476             // End of a negative look-behind block, after a successful match.
5477             {
5478                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5479                 if (fp->fInputIdx != fActiveLimit) {
5480                     //  The look-behind expression matched, but the match did not
5481                     //    extend all the way to the point that we are looking behind from.
5482                     //  FAIL out of here, which will take us back to the LB_CONT, which
5483                     //     will retry the match starting at another position or succeed
5484                     //     the look-behind altogether, whichever is appropriate.
5485                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5486                     break;
5487                 }
5488 
5489                 // Look-behind expression matched, which means look-behind test as
5490                 //   a whole Fails
5491 
5492                 //   Restore the orignal input string length, which had been truncated
5493                 //   inorder to pin the end of the lookbehind match
5494                 //   to the position being looked-behind.
5495                 fActiveStart = fData[opValue+2];
5496                 fActiveLimit = fData[opValue+3];
5497                 U_ASSERT(fActiveStart >= 0);
5498                 U_ASSERT(fActiveLimit <= fInputLength);
5499 
5500                 // Restore original stack position, discarding any state saved
5501                 //   by the successful pattern match.
5502                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5503                 int32_t newStackSize = (int32_t)fData[opValue];
5504                 U_ASSERT(fStack->size() > newStackSize);
5505                 fStack->setSize(newStackSize);
5506 
5507                 //  FAIL, which will take control back to someplace
5508                 //  prior to entering the look-behind test.
5509                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5510             }
5511             break;
5512 
5513 
5514         case URX_LOOP_SR_I:
5515             // Loop Initialization for the optimized implementation of
5516             //     [some character set]*
5517             //   This op scans through all matching input.
5518             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5519             {
5520                 U_ASSERT(opValue > 0 && opValue < fSets->size());
5521                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5522                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5523 
5524                 // Loop through input, until either the input is exhausted or
5525                 //   we reach a character that is not a member of the set.
5526                 int32_t ix = (int32_t)fp->fInputIdx;
5527                 for (;;) {
5528                     if (ix >= fActiveLimit) {
5529                         fHitEnd = TRUE;
5530                         break;
5531                     }
5532                     UChar32   c;
5533                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5534                     if (c<256) {
5535                         if (s8->contains(c) == FALSE) {
5536                             U16_BACK_1(inputBuf, 0, ix);
5537                             break;
5538                         }
5539                     } else {
5540                         if (s->contains(c) == FALSE) {
5541                             U16_BACK_1(inputBuf, 0, ix);
5542                             break;
5543                         }
5544                     }
5545                 }
5546 
5547                 // If there were no matching characters, skip over the loop altogether.
5548                 //   The loop doesn't run at all, a * op always succeeds.
5549                 if (ix == fp->fInputIdx) {
5550                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5551                     break;
5552                 }
5553 
5554                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5555                 //   must follow.  It's operand is the stack location
5556                 //   that holds the starting input index for the match of this [set]*
5557                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5558                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5559                 int32_t stackLoc = URX_VAL(loopcOp);
5560                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5561                 fp->fExtra[stackLoc] = fp->fInputIdx;
5562                 fp->fInputIdx = ix;
5563 
5564                 // Save State to the URX_LOOP_C op that follows this one,
5565                 //   so that match failures in the following code will return to there.
5566                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5567                 fp = StateSave(fp, fp->fPatIdx, status);
5568                 fp->fPatIdx++;
5569             }
5570             break;
5571 
5572 
5573         case URX_LOOP_DOT_I:
5574             // Loop Initialization for the optimized implementation of .*
5575             //   This op scans through all remaining input.
5576             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5577             {
5578                 // Loop through input until the input is exhausted (we reach an end-of-line)
5579                 // In DOTALL mode, we can just go straight to the end of the input.
5580                 int32_t ix;
5581                 if ((opValue & 1) == 1) {
5582                     // Dot-matches-All mode.  Jump straight to the end of the string.
5583                     ix = (int32_t)fActiveLimit;
5584                     fHitEnd = TRUE;
5585                 } else {
5586                     // NOT DOT ALL mode.  Line endings do not match '.'
5587                     // Scan forward until a line ending or end of input.
5588                     ix = (int32_t)fp->fInputIdx;
5589                     for (;;) {
5590                         if (ix >= fActiveLimit) {
5591                             fHitEnd = TRUE;
5592                             break;
5593                         }
5594                         UChar32   c;
5595                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5596                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5597                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5598                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5599                                    isLineTerminator(c))) {
5600                                 //  char is a line ending.  Put the input pos back to the
5601                                 //    line ending char, and exit the scanning loop.
5602                                 U16_BACK_1(inputBuf, 0, ix);
5603                                 break;
5604                             }
5605                         }
5606                     }
5607                 }
5608 
5609                 // If there were no matching characters, skip over the loop altogether.
5610                 //   The loop doesn't run at all, a * op always succeeds.
5611                 if (ix == fp->fInputIdx) {
5612                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5613                     break;
5614                 }
5615 
5616                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5617                 //   must follow.  It's operand is the stack location
5618                 //   that holds the starting input index for the match of this .*
5619                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5620                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5621                 int32_t stackLoc = URX_VAL(loopcOp);
5622                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5623                 fp->fExtra[stackLoc] = fp->fInputIdx;
5624                 fp->fInputIdx = ix;
5625 
5626                 // Save State to the URX_LOOP_C op that follows this one,
5627                 //   so that match failures in the following code will return to there.
5628                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5629                 fp = StateSave(fp, fp->fPatIdx, status);
5630                 fp->fPatIdx++;
5631             }
5632             break;
5633 
5634 
5635         case URX_LOOP_C:
5636             {
5637                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5638                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5639                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5640                 if (backSearchIndex == fp->fInputIdx) {
5641                     // We've backed up the input idx to the point that the loop started.
5642                     // The loop is done.  Leave here without saving state.
5643                     //  Subsequent failures won't come back here.
5644                     break;
5645                 }
5646                 // Set up for the next iteration of the loop, with input index
5647                 //   backed up by one from the last time through,
5648                 //   and a state save to this instruction in case the following code fails again.
5649                 //   (We're going backwards because this loop emulates stack unwinding, not
5650                 //    the initial scan forward.)
5651                 U_ASSERT(fp->fInputIdx > 0);
5652                 UChar32 prevC;
5653                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5654 
5655                 if (prevC == 0x0a &&
5656                     fp->fInputIdx > backSearchIndex &&
5657                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5658                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5659                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5660                         // .*, stepping back over CRLF pair.
5661                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5662                     }
5663                 }
5664 
5665 
5666                 fp = StateSave(fp, fp->fPatIdx-1, status);
5667             }
5668             break;
5669 
5670 
5671 
5672         default:
5673             // Trouble.  The compiled pattern contains an entry with an
5674             //           unrecognized type tag.
5675             UPRV_UNREACHABLE;
5676         }
5677 
5678         if (U_FAILURE(status)) {
5679             isMatch = FALSE;
5680             break;
5681         }
5682     }
5683 
5684 breakFromLoop:
5685     fMatch = isMatch;
5686     if (isMatch) {
5687         fLastMatchEnd = fMatchEnd;
5688         fMatchStart   = startIdx;
5689         fMatchEnd     = fp->fInputIdx;
5690     }
5691 
5692 #ifdef REGEX_RUN_DEBUG
5693     if (fTraceDebug) {
5694         if (isMatch) {
5695             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5696         } else {
5697             printf("No match\n\n");
5698         }
5699     }
5700 #endif
5701 
5702     fFrame = fp;                // The active stack frame when the engine stopped.
5703                                 //   Contains the capture group results that we need to
5704                                 //    access later.
5705 
5706     return;
5707 }
5708 
5709 
5710 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5711 
5712 U_NAMESPACE_END
5713 
5714 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5715 
5716