• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 *   Copyright (C) 2002-2016 International Business Machines Corporation
6 *   and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 //  file:  rematch.cpp
11 //
12 //         Contains the implementation of class RegexMatcher,
13 //         which is one of the main API classes for the ICU regular expression package.
14 //
15 
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36 
37 // #include <malloc.h>        // Needed for heapcheck testing
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 // Default limit for the size of the back track stack, to avoid system
43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 //    which use recursion rather than the heap, and take more storage per
46 //    backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49 
50 // Time limit counter constant.
51 //   Time limits for expression evaluation are in terms of quanta of work by
52 //   the engine, each of which is 10,000 state saves.
53 //   This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55 
56 
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60         return false;
61     }
62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64 
65 //-----------------------------------------------------------------------------
66 //
67 //   Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71     fDeferredStatus = U_ZERO_ERROR;
72     init(fDeferredStatus);
73     if (U_FAILURE(fDeferredStatus)) {
74         return;
75     }
76     if (pat==NULL) {
77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80     fPattern = pat;
81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83 
84 
85 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                            uint32_t flags, UErrorCode &status) {
88     init(status);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     UParseError    pe;
93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94     fPattern           = fPatternOwned;
95 
96     UText inputText = UTEXT_INITIALIZER;
97     utext_openConstUnicodeString(&inputText, &input, &status);
98     init2(&inputText, status);
99     utext_close(&inputText);
100 
101     fInputUniStrMaybeMutable = TRUE;
102 }
103 
104 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                            uint32_t flags, UErrorCode &status) {
107     init(status);
108     if (U_FAILURE(status)) {
109         return;
110     }
111     UParseError    pe;
112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113     if (U_FAILURE(status)) {
114         return;
115     }
116 
117     fPattern           = fPatternOwned;
118     init2(input, status);
119 }
120 
121 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                            uint32_t flags, UErrorCode &status) {
124     init(status);
125     if (U_FAILURE(status)) {
126         return;
127     }
128     UParseError    pe;
129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130     if (U_FAILURE(status)) {
131         return;
132     }
133     fPattern           = fPatternOwned;
134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138                            uint32_t flags, UErrorCode &status) {
139     init(status);
140     if (U_FAILURE(status)) {
141         return;
142     }
143     UParseError    pe;
144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145         if (U_FAILURE(status)) {
146         return;
147     }
148 
149     fPattern           = fPatternOwned;
150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152 
153 
154 
155 
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157     delete fStack;
158     if (fData != fSmallData) {
159         uprv_free(fData);
160         fData = NULL;
161     }
162     if (fPatternOwned) {
163         delete fPatternOwned;
164         fPatternOwned = NULL;
165         fPattern = NULL;
166     }
167 
168     if (fInput) {
169         delete fInput;
170     }
171     if (fInputText) {
172         utext_close(fInputText);
173     }
174     if (fAltInputText) {
175         utext_close(fAltInputText);
176     }
177 
178     #if UCONFIG_NO_BREAK_ITERATION==0
179     delete fWordBreakItr;
180     delete fGCBreakItr;
181     #endif
182 }
183 
184 //
185 //   init()   common initialization for use by all constructors.
186 //            Initialize all fields, get the object into a consistent state.
187 //            This must be done even when the initial status shows an error,
188 //            so that the object is initialized sufficiently well for the destructor
189 //            to run safely.
190 //
init(UErrorCode & status)191 void RegexMatcher::init(UErrorCode &status) {
192     fPattern           = NULL;
193     fPatternOwned      = NULL;
194     fFrameSize         = 0;
195     fRegionStart       = 0;
196     fRegionLimit       = 0;
197     fAnchorStart       = 0;
198     fAnchorLimit       = 0;
199     fLookStart         = 0;
200     fLookLimit         = 0;
201     fActiveStart       = 0;
202     fActiveLimit       = 0;
203     fTransparentBounds = FALSE;
204     fAnchoringBounds   = TRUE;
205     fMatch             = FALSE;
206     fMatchStart        = 0;
207     fMatchEnd          = 0;
208     fLastMatchEnd      = -1;
209     fAppendPosition    = 0;
210     fHitEnd            = FALSE;
211     fRequireEnd        = FALSE;
212     fStack             = NULL;
213     fFrame             = NULL;
214     fTimeLimit         = 0;
215     fTime              = 0;
216     fTickCounter       = 0;
217     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
218     fCallbackFn        = NULL;
219     fCallbackContext   = NULL;
220     fFindProgressCallbackFn      = NULL;
221     fFindProgressCallbackContext = NULL;
222     fTraceDebug        = FALSE;
223     fDeferredStatus    = status;
224     fData              = fSmallData;
225     fWordBreakItr      = NULL;
226     fGCBreakItr        = NULL;
227 
228     fStack             = NULL;
229     fInputText         = NULL;
230     fAltInputText      = NULL;
231     fInput             = NULL;
232     fInputLength       = 0;
233     fInputUniStrMaybeMutable = FALSE;
234 }
235 
236 //
237 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
238 //            This handles the common setup to be done after the Pattern is available.
239 //
init2(UText * input,UErrorCode & status)240 void RegexMatcher::init2(UText *input, UErrorCode &status) {
241     if (U_FAILURE(status)) {
242         fDeferredStatus = status;
243         return;
244     }
245 
246     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
247         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
248         if (fData == NULL) {
249             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
250             return;
251         }
252     }
253 
254     fStack = new UVector64(status);
255     if (fStack == NULL) {
256         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
257         return;
258     }
259 
260     reset(input);
261     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
262     if (U_FAILURE(status)) {
263         fDeferredStatus = status;
264         return;
265     }
266 }
267 
268 
269 static const UChar BACKSLASH  = 0x5c;
270 static const UChar DOLLARSIGN = 0x24;
271 static const UChar LEFTBRACKET = 0x7b;
272 static const UChar RIGHTBRACKET = 0x7d;
273 
274 //--------------------------------------------------------------------------------
275 //
276 //    appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                               const UnicodeString &replacement,
281                                               UErrorCode &status) {
282     UText replacementText = UTEXT_INITIALIZER;
283 
284     utext_openConstUnicodeString(&replacementText, &replacement, &status);
285     if (U_SUCCESS(status)) {
286         UText resultText = UTEXT_INITIALIZER;
287         utext_openUnicodeString(&resultText, &dest, &status);
288 
289         if (U_SUCCESS(status)) {
290             appendReplacement(&resultText, &replacementText, status);
291             utext_close(&resultText);
292         }
293         utext_close(&replacementText);
294     }
295 
296     return *this;
297 }
298 
299 //
300 //    appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                               UText *replacement,
304                                               UErrorCode &status) {
305     if (U_FAILURE(status)) {
306         return *this;
307     }
308     if (U_FAILURE(fDeferredStatus)) {
309         status = fDeferredStatus;
310         return *this;
311     }
312     if (fMatch == FALSE) {
313         status = U_REGEX_INVALID_STATE;
314         return *this;
315     }
316 
317     // Copy input string from the end of previous match to start of current match
318     int64_t  destLen = utext_nativeLength(dest);
319     if (fMatchStart > fAppendPosition) {
320         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                      (int32_t)(fMatchStart-fAppendPosition), &status);
323         } else {
324             int32_t len16;
325             if (UTEXT_USES_U16(fInputText)) {
326                 len16 = (int32_t)(fMatchStart-fAppendPosition);
327             } else {
328                 UErrorCode lengthStatus = U_ZERO_ERROR;
329                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330             }
331             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332             if (inputChars == NULL) {
333                 status = U_MEMORY_ALLOCATION_ERROR;
334                 return *this;
335             }
336             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338             uprv_free(inputChars);
339         }
340     }
341     fAppendPosition = fMatchEnd;
342 
343 
344     // scan the replacement text, looking for substitutions ($n) and \escapes.
345     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346     //         move entire ranges not containing substitutions.
347     UTEXT_SETNATIVEINDEX(replacement, 0);
348     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
349         if (c == BACKSLASH) {
350             // Backslash Escape.  Copy the following char out without further checks.
351             //                    Note:  Surrogate pairs don't need any special handling
352             //                           The second half wont be a '$' or a '\', and
353             //                           will move to the dest normally on the next
354             //                           loop iteration.
355             c = UTEXT_CURRENT32(replacement);
356             if (c == U_SENTINEL) {
357                 break;
358             }
359 
360             if (c==0x55/*U*/ || c==0x75/*u*/) {
361                 // We have a \udddd or \Udddddddd escape sequence.
362                 int32_t offset = 0;
363                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
364                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
365                 if (escapedChar != (UChar32)0xFFFFFFFF) {
366                     if (U_IS_BMP(escapedChar)) {
367                         UChar c16 = (UChar)escapedChar;
368                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369                     } else {
370                         UChar surrogate[2];
371                         surrogate[0] = U16_LEAD(escapedChar);
372                         surrogate[1] = U16_TRAIL(escapedChar);
373                         if (U_SUCCESS(status)) {
374                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375                         }
376                     }
377                     // TODO:  Report errors for mal-formed \u escapes?
378                     //        As this is, the original sequence is output, which may be OK.
379                     if (context.lastOffset == offset) {
380                         (void)UTEXT_PREVIOUS32(replacement);
381                     } else if (context.lastOffset != offset-1) {
382                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
383                     }
384                 }
385             } else {
386                 (void)UTEXT_NEXT32(replacement);
387                 // Plain backslash escape.  Just put out the escaped character.
388                 if (U_IS_BMP(c)) {
389                     UChar c16 = (UChar)c;
390                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391                 } else {
392                     UChar surrogate[2];
393                     surrogate[0] = U16_LEAD(c);
394                     surrogate[1] = U16_TRAIL(c);
395                     if (U_SUCCESS(status)) {
396                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
397                     }
398                 }
399             }
400         } else if (c != DOLLARSIGN) {
401             // Normal char, not a $.  Copy it out without further checks.
402             if (U_IS_BMP(c)) {
403                 UChar c16 = (UChar)c;
404                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405             } else {
406                 UChar surrogate[2];
407                 surrogate[0] = U16_LEAD(c);
408                 surrogate[1] = U16_TRAIL(c);
409                 if (U_SUCCESS(status)) {
410                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
411                 }
412             }
413         } else {
414             // We've got a $.  Pick up a capture group name or number if one follows.
415             // Consume digits so long as the resulting group number <= the number of
416             // number of capture groups in the pattern.
417 
418             int32_t groupNum  = 0;
419             int32_t numDigits = 0;
420             UChar32 nextChar = utext_current32(replacement);
421             if (nextChar == LEFTBRACKET) {
422                 // Scan for a Named Capture Group, ${name}.
423                 UnicodeString groupName;
424                 utext_next32(replacement);
425                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426                     nextChar = utext_next32(replacement);
427                     if (nextChar == U_SENTINEL) {
428                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
430                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
431                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
432                         groupName.append(nextChar);
433                     } else if (nextChar == RIGHTBRACKET) {
434                         groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
435                         if (groupNum == 0) {
436                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437                         }
438                     } else {
439                         // Character was something other than a name char or a closing '}'
440                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441                     }
442                 }
443 
444             } else if (u_isdigit(nextChar)) {
445                 // $n    Scan for a capture group number
446                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
447                 for (;;) {
448                     nextChar = UTEXT_CURRENT32(replacement);
449                     if (nextChar == U_SENTINEL) {
450                         break;
451                     }
452                     if (u_isdigit(nextChar) == FALSE) {
453                         break;
454                     }
455                     int32_t nextDigitVal = u_charDigitValue(nextChar);
456                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457                         // Don't consume the next digit if it makes the capture group number too big.
458                         if (numDigits == 0) {
459                             status = U_INDEX_OUTOFBOUNDS_ERROR;
460                         }
461                         break;
462                     }
463                     (void)UTEXT_NEXT32(replacement);
464                     groupNum=groupNum*10 + nextDigitVal;
465                     ++numDigits;
466                 }
467             } else {
468                 // $ not followed by capture group name or number.
469                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470             }
471 
472             if (U_SUCCESS(status)) {
473                 destLen += appendGroup(groupNum, dest, status);
474             }
475         }  // End of $ capture group handling
476     }  // End of per-character loop through the replacement string.
477 
478     return *this;
479 }
480 
481 
482 
483 //--------------------------------------------------------------------------------
484 //
485 //    appendTail     Intended to be used in conjunction with appendReplacement()
486 //                   To the destination string, append everything following
487 //                   the last match position from the input string.
488 //
489 //                   Note:  Match ranges do not affect appendTail or appendReplacement
490 //
491 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)492 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
493     UErrorCode status = U_ZERO_ERROR;
494     UText resultText = UTEXT_INITIALIZER;
495     utext_openUnicodeString(&resultText, &dest, &status);
496 
497     if (U_SUCCESS(status)) {
498         appendTail(&resultText, status);
499         utext_close(&resultText);
500     }
501 
502     return dest;
503 }
504 
505 //
506 //   appendTail, UText mode
507 //
appendTail(UText * dest,UErrorCode & status)508 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
509     if (U_FAILURE(status)) {
510         return dest;
511     }
512     if (U_FAILURE(fDeferredStatus)) {
513         status = fDeferredStatus;
514         return dest;
515     }
516 
517     if (fInputLength > fAppendPosition) {
518         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
519             int64_t destLen = utext_nativeLength(dest);
520             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
521                           (int32_t)(fInputLength-fAppendPosition), &status);
522         } else {
523             int32_t len16;
524             if (UTEXT_USES_U16(fInputText)) {
525                 len16 = (int32_t)(fInputLength-fAppendPosition);
526             } else {
527                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
528                 status = U_ZERO_ERROR; // buffer overflow
529             }
530 
531             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
532             if (inputChars == NULL) {
533                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
534             } else {
535                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
536                 int64_t destLen = utext_nativeLength(dest);
537                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
538                 uprv_free(inputChars);
539             }
540         }
541     }
542     return dest;
543 }
544 
545 
546 
547 //--------------------------------------------------------------------------------
548 //
549 //   end
550 //
551 //--------------------------------------------------------------------------------
end(UErrorCode & err) const552 int32_t RegexMatcher::end(UErrorCode &err) const {
553     return end(0, err);
554 }
555 
end64(UErrorCode & err) const556 int64_t RegexMatcher::end64(UErrorCode &err) const {
557     return end64(0, err);
558 }
559 
end64(int32_t group,UErrorCode & err) const560 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
561     if (U_FAILURE(err)) {
562         return -1;
563     }
564     if (fMatch == FALSE) {
565         err = U_REGEX_INVALID_STATE;
566         return -1;
567     }
568     if (group < 0 || group > fPattern->fGroupMap->size()) {
569         err = U_INDEX_OUTOFBOUNDS_ERROR;
570         return -1;
571     }
572     int64_t e = -1;
573     if (group == 0) {
574         e = fMatchEnd;
575     } else {
576         // Get the position within the stack frame of the variables for
577         //    this capture group.
578         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
579         U_ASSERT(groupOffset < fPattern->fFrameSize);
580         U_ASSERT(groupOffset >= 0);
581         e = fFrame->fExtra[groupOffset + 1];
582     }
583 
584         return e;
585 }
586 
end(int32_t group,UErrorCode & err) const587 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
588     return (int32_t)end64(group, err);
589 }
590 
591 //--------------------------------------------------------------------------------
592 //
593 //   findProgressInterrupt  This function is called once for each advance in the target
594 //                          string from the find() function, and calls the user progress callback
595 //                          function if there is one installed.
596 //
597 //         Return:  TRUE if the find operation is to be terminated.
598 //                  FALSE if the find operation is to continue running.
599 //
600 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)601 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
602     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
603         status = U_REGEX_STOPPED_BY_CALLER;
604         return TRUE;
605     }
606     return FALSE;
607 }
608 
609 //--------------------------------------------------------------------------------
610 //
611 //   find()
612 //
613 //--------------------------------------------------------------------------------
find()614 UBool RegexMatcher::find() {
615     if (U_FAILURE(fDeferredStatus)) {
616         return FALSE;
617     }
618     UErrorCode status = U_ZERO_ERROR;
619     UBool result = find(status);
620     return result;
621 }
622 
623 //--------------------------------------------------------------------------------
624 //
625 //   find()
626 //
627 //--------------------------------------------------------------------------------
find(UErrorCode & status)628 UBool RegexMatcher::find(UErrorCode &status) {
629     // Start at the position of the last match end.  (Will be zero if the
630     //   matcher has been reset.)
631     //
632     if (U_FAILURE(status)) {
633         return FALSE;
634     }
635     if (U_FAILURE(fDeferredStatus)) {
636         status = fDeferredStatus;
637         return FALSE;
638     }
639 
640     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
641         return findUsingChunk(status);
642     }
643 
644     int64_t startPos = fMatchEnd;
645     if (startPos==0) {
646         startPos = fActiveStart;
647     }
648 
649     if (fMatch) {
650         // Save the position of any previous successful match.
651         fLastMatchEnd = fMatchEnd;
652 
653         if (fMatchStart == fMatchEnd) {
654             // Previous match had zero length.  Move start position up one position
655             //  to avoid sending find() into a loop on zero-length matches.
656             if (startPos >= fActiveLimit) {
657                 fMatch = FALSE;
658                 fHitEnd = TRUE;
659                 return FALSE;
660             }
661             UTEXT_SETNATIVEINDEX(fInputText, startPos);
662             (void)UTEXT_NEXT32(fInputText);
663             startPos = UTEXT_GETNATIVEINDEX(fInputText);
664         }
665     } else {
666         if (fLastMatchEnd >= 0) {
667             // A previous find() failed to match.  Don't try again.
668             //   (without this test, a pattern with a zero-length match
669             //    could match again at the end of an input string.)
670             fHitEnd = TRUE;
671             return FALSE;
672         }
673     }
674 
675 
676     // Compute the position in the input string beyond which a match can not begin, because
677     //   the minimum length match would extend past the end of the input.
678     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
679     //          Be aware of possible overflows if making changes here.
680     int64_t testStartLimit;
681     if (UTEXT_USES_U16(fInputText)) {
682         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683         if (startPos > testStartLimit) {
684             fMatch = FALSE;
685             fHitEnd = TRUE;
686             return FALSE;
687         }
688     } else {
689         // We don't know exactly how long the minimum match length is in native characters.
690         // Treat anything > 0 as 1.
691         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692     }
693 
694     UChar32  c;
695     U_ASSERT(startPos >= 0);
696 
697     switch (fPattern->fStartType) {
698     case START_NO_INFO:
699         // No optimization was found.
700         //  Try a match at each input position.
701         for (;;) {
702             MatchAt(startPos, FALSE, status);
703             if (U_FAILURE(status)) {
704                 return FALSE;
705             }
706             if (fMatch) {
707                 return TRUE;
708             }
709             if (startPos >= testStartLimit) {
710                 fHitEnd = TRUE;
711                 return FALSE;
712             }
713             UTEXT_SETNATIVEINDEX(fInputText, startPos);
714             (void)UTEXT_NEXT32(fInputText);
715             startPos = UTEXT_GETNATIVEINDEX(fInputText);
716             // Note that it's perfectly OK for a pattern to have a zero-length
717             //   match at the end of a string, so we must make sure that the loop
718             //   runs with startPos == testStartLimit the last time through.
719             if  (findProgressInterrupt(startPos, status))
720                 return FALSE;
721         }
722         UPRV_UNREACHABLE_EXIT;
723 
724     case START_START:
725         // Matches are only possible at the start of the input string
726         //   (pattern begins with ^ or \A)
727         if (startPos > fActiveStart) {
728             fMatch = FALSE;
729             return FALSE;
730         }
731         MatchAt(startPos, FALSE, status);
732         if (U_FAILURE(status)) {
733             return FALSE;
734         }
735         return fMatch;
736 
737 
738     case START_SET:
739         {
740             // Match may start on any char from a pre-computed set.
741             U_ASSERT(fPattern->fMinMatchLen > 0);
742             UTEXT_SETNATIVEINDEX(fInputText, startPos);
743             for (;;) {
744                 int64_t pos = startPos;
745                 c = UTEXT_NEXT32(fInputText);
746                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
747                 // c will be -1 (U_SENTINEL) at end of text, in which case we
748                 // skip this next block (so we don't have a negative array index)
749                 // and handle end of text in the following block.
750                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
752                     MatchAt(pos, FALSE, status);
753                     if (U_FAILURE(status)) {
754                         return FALSE;
755                     }
756                     if (fMatch) {
757                         return TRUE;
758                     }
759                     UTEXT_SETNATIVEINDEX(fInputText, pos);
760                 }
761                 if (startPos > testStartLimit) {
762                     fMatch = FALSE;
763                     fHitEnd = TRUE;
764                     return FALSE;
765                 }
766                 if  (findProgressInterrupt(startPos, status))
767                     return FALSE;
768             }
769         }
770         UPRV_UNREACHABLE_EXIT;
771 
772     case START_STRING:
773     case START_CHAR:
774         {
775             // Match starts on exactly one char.
776             U_ASSERT(fPattern->fMinMatchLen > 0);
777             UChar32 theChar = fPattern->fInitialChar;
778             UTEXT_SETNATIVEINDEX(fInputText, startPos);
779             for (;;) {
780                 int64_t pos = startPos;
781                 c = UTEXT_NEXT32(fInputText);
782                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783                 if (c == theChar) {
784                     MatchAt(pos, FALSE, status);
785                     if (U_FAILURE(status)) {
786                         return FALSE;
787                     }
788                     if (fMatch) {
789                         return TRUE;
790                     }
791                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
792                 }
793                 if (startPos > testStartLimit) {
794                     fMatch = FALSE;
795                     fHitEnd = TRUE;
796                     return FALSE;
797                 }
798                 if  (findProgressInterrupt(startPos, status))
799                     return FALSE;
800            }
801         }
802         UPRV_UNREACHABLE_EXIT;
803 
804     case START_LINE:
805         {
806             UChar32 ch;
807             if (startPos == fAnchorStart) {
808                 MatchAt(startPos, FALSE, status);
809                 if (U_FAILURE(status)) {
810                     return FALSE;
811                 }
812                 if (fMatch) {
813                     return TRUE;
814                 }
815                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816                 ch = UTEXT_NEXT32(fInputText);
817                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
818             } else {
819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820                 ch = UTEXT_PREVIOUS32(fInputText);
821                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
822             }
823 
824             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825                 for (;;) {
826                     if (ch == 0x0a) {
827                             MatchAt(startPos, FALSE, status);
828                             if (U_FAILURE(status)) {
829                                 return FALSE;
830                             }
831                             if (fMatch) {
832                                 return TRUE;
833                             }
834                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
835                     }
836                     if (startPos >= testStartLimit) {
837                         fMatch = FALSE;
838                         fHitEnd = TRUE;
839                         return FALSE;
840                     }
841                     ch = UTEXT_NEXT32(fInputText);
842                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
843                     // Note that it's perfectly OK for a pattern to have a zero-length
844                     //   match at the end of a string, so we must make sure that the loop
845                     //   runs with startPos == testStartLimit the last time through.
846                     if  (findProgressInterrupt(startPos, status))
847                         return FALSE;
848                 }
849             } else {
850                 for (;;) {
851                     if (isLineTerminator(ch)) {
852                         if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
853                             (void)UTEXT_NEXT32(fInputText);
854                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
855                         }
856                         MatchAt(startPos, FALSE, status);
857                         if (U_FAILURE(status)) {
858                             return FALSE;
859                         }
860                         if (fMatch) {
861                             return TRUE;
862                         }
863                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
864                     }
865                     if (startPos >= testStartLimit) {
866                         fMatch = FALSE;
867                         fHitEnd = TRUE;
868                         return FALSE;
869                     }
870                     ch = UTEXT_NEXT32(fInputText);
871                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
872                     // Note that it's perfectly OK for a pattern to have a zero-length
873                     //   match at the end of a string, so we must make sure that the loop
874                     //   runs with startPos == testStartLimit the last time through.
875                     if  (findProgressInterrupt(startPos, status))
876                         return FALSE;
877                 }
878             }
879         }
880 
881     default:
882         UPRV_UNREACHABLE_ASSERT;
883         // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
884         // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
885         // See ICU-21669.
886         status = U_INTERNAL_PROGRAM_ERROR;
887         return FALSE;
888     }
889 
890     UPRV_UNREACHABLE_EXIT;
891 }
892 
893 
894 
find(int64_t start,UErrorCode & status)895 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
896     if (U_FAILURE(status)) {
897         return FALSE;
898     }
899     if (U_FAILURE(fDeferredStatus)) {
900         status = fDeferredStatus;
901         return FALSE;
902     }
903     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
904                                           //        This will reset the region to be the full input length.
905     if (start < 0) {
906         status = U_INDEX_OUTOFBOUNDS_ERROR;
907         return FALSE;
908     }
909 
910     int64_t nativeStart = start;
911     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
912         status = U_INDEX_OUTOFBOUNDS_ERROR;
913         return FALSE;
914     }
915     fMatchEnd = nativeStart;
916     return find(status);
917 }
918 
919 
920 //--------------------------------------------------------------------------------
921 //
922 //   findUsingChunk() -- like find(), but with the advance knowledge that the
923 //                       entire string is available in the UText's chunk buffer.
924 //
925 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)926 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
927     // Start at the position of the last match end.  (Will be zero if the
928     //   matcher has been reset.
929     //
930 
931     int32_t startPos = (int32_t)fMatchEnd;
932     if (startPos==0) {
933         startPos = (int32_t)fActiveStart;
934     }
935 
936     const UChar *inputBuf = fInputText->chunkContents;
937 
938     if (fMatch) {
939         // Save the position of any previous successful match.
940         fLastMatchEnd = fMatchEnd;
941 
942         if (fMatchStart == fMatchEnd) {
943             // Previous match had zero length.  Move start position up one position
944             //  to avoid sending find() into a loop on zero-length matches.
945             if (startPos >= fActiveLimit) {
946                 fMatch = FALSE;
947                 fHitEnd = TRUE;
948                 return FALSE;
949             }
950             U16_FWD_1(inputBuf, startPos, fInputLength);
951         }
952     } else {
953         if (fLastMatchEnd >= 0) {
954             // A previous find() failed to match.  Don't try again.
955             //   (without this test, a pattern with a zero-length match
956             //    could match again at the end of an input string.)
957             fHitEnd = TRUE;
958             return FALSE;
959         }
960     }
961 
962 
963     // Compute the position in the input string beyond which a match can not begin, because
964     //   the minimum length match would extend past the end of the input.
965     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
966     //          Be aware of possible overflows if making changes here.
967     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
968     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
969     if (startPos > testLen) {
970         fMatch = FALSE;
971         fHitEnd = TRUE;
972         return FALSE;
973     }
974 
975     UChar32  c;
976     U_ASSERT(startPos >= 0);
977 
978     switch (fPattern->fStartType) {
979     case START_NO_INFO:
980         // No optimization was found.
981         //  Try a match at each input position.
982         for (;;) {
983             MatchChunkAt(startPos, FALSE, status);
984             if (U_FAILURE(status)) {
985                 return FALSE;
986             }
987             if (fMatch) {
988                 return TRUE;
989             }
990             if (startPos >= testLen) {
991                 fHitEnd = TRUE;
992                 return FALSE;
993             }
994             U16_FWD_1(inputBuf, startPos, fActiveLimit);
995             // Note that it's perfectly OK for a pattern to have a zero-length
996             //   match at the end of a string, so we must make sure that the loop
997             //   runs with startPos == testLen the last time through.
998             if  (findProgressInterrupt(startPos, status))
999                 return FALSE;
1000         }
1001         UPRV_UNREACHABLE_EXIT;
1002 
1003     case START_START:
1004         // Matches are only possible at the start of the input string
1005         //   (pattern begins with ^ or \A)
1006         if (startPos > fActiveStart) {
1007             fMatch = FALSE;
1008             return FALSE;
1009         }
1010         MatchChunkAt(startPos, FALSE, status);
1011         if (U_FAILURE(status)) {
1012             return FALSE;
1013         }
1014         return fMatch;
1015 
1016 
1017     case START_SET:
1018     {
1019         // Match may start on any char from a pre-computed set.
1020         U_ASSERT(fPattern->fMinMatchLen > 0);
1021         for (;;) {
1022             int32_t pos = startPos;
1023             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1024             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1025                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1026                 MatchChunkAt(pos, FALSE, status);
1027                 if (U_FAILURE(status)) {
1028                     return FALSE;
1029                 }
1030                 if (fMatch) {
1031                     return TRUE;
1032                 }
1033             }
1034             if (startPos > testLen) {
1035                 fMatch = FALSE;
1036                 fHitEnd = TRUE;
1037                 return FALSE;
1038             }
1039             if  (findProgressInterrupt(startPos, status))
1040                 return FALSE;
1041         }
1042     }
1043     UPRV_UNREACHABLE_EXIT;
1044 
1045     case START_STRING:
1046     case START_CHAR:
1047     {
1048         // Match starts on exactly one char.
1049         U_ASSERT(fPattern->fMinMatchLen > 0);
1050         UChar32 theChar = fPattern->fInitialChar;
1051         for (;;) {
1052             int32_t pos = startPos;
1053             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1054             if (c == theChar) {
1055                 MatchChunkAt(pos, FALSE, status);
1056                 if (U_FAILURE(status)) {
1057                     return FALSE;
1058                 }
1059                 if (fMatch) {
1060                     return TRUE;
1061                 }
1062             }
1063             if (startPos > testLen) {
1064                 fMatch = FALSE;
1065                 fHitEnd = TRUE;
1066                 return FALSE;
1067             }
1068             if  (findProgressInterrupt(startPos, status))
1069                 return FALSE;
1070         }
1071     }
1072     UPRV_UNREACHABLE_EXIT;
1073 
1074     case START_LINE:
1075     {
1076         UChar32 ch;
1077         if (startPos == fAnchorStart) {
1078             MatchChunkAt(startPos, FALSE, status);
1079             if (U_FAILURE(status)) {
1080                 return FALSE;
1081             }
1082             if (fMatch) {
1083                 return TRUE;
1084             }
1085             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1086         }
1087 
1088         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1089             for (;;) {
1090                 ch = inputBuf[startPos-1];
1091                 if (ch == 0x0a) {
1092                     MatchChunkAt(startPos, FALSE, status);
1093                     if (U_FAILURE(status)) {
1094                         return FALSE;
1095                     }
1096                     if (fMatch) {
1097                         return TRUE;
1098                     }
1099                 }
1100                 if (startPos >= testLen) {
1101                     fMatch = FALSE;
1102                     fHitEnd = TRUE;
1103                     return FALSE;
1104                 }
1105                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1106                 // Note that it's perfectly OK for a pattern to have a zero-length
1107                 //   match at the end of a string, so we must make sure that the loop
1108                 //   runs with startPos == testLen the last time through.
1109                 if  (findProgressInterrupt(startPos, status))
1110                     return FALSE;
1111             }
1112         } else {
1113             for (;;) {
1114                 ch = inputBuf[startPos-1];
1115                 if (isLineTerminator(ch)) {
1116                     if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1117                         startPos++;
1118                     }
1119                     MatchChunkAt(startPos, FALSE, status);
1120                     if (U_FAILURE(status)) {
1121                         return FALSE;
1122                     }
1123                     if (fMatch) {
1124                         return TRUE;
1125                     }
1126                 }
1127                 if (startPos >= testLen) {
1128                     fMatch = FALSE;
1129                     fHitEnd = TRUE;
1130                     return FALSE;
1131                 }
1132                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1133                 // Note that it's perfectly OK for a pattern to have a zero-length
1134                 //   match at the end of a string, so we must make sure that the loop
1135                 //   runs with startPos == testLen the last time through.
1136                 if  (findProgressInterrupt(startPos, status))
1137                     return FALSE;
1138             }
1139         }
1140     }
1141 
1142     default:
1143         UPRV_UNREACHABLE_ASSERT;
1144         // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1145         // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1146         // See ICU-21669.
1147         status = U_INTERNAL_PROGRAM_ERROR;
1148         return FALSE;
1149     }
1150 
1151     UPRV_UNREACHABLE_EXIT;
1152 }
1153 
1154 
1155 
1156 //--------------------------------------------------------------------------------
1157 //
1158 //  group()
1159 //
1160 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1161 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1162     return group(0, status);
1163 }
1164 
1165 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1166 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1167     return group(0, dest, group_len, status);
1168 }
1169 
1170 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1171 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1172     group_len = 0;
1173     if (U_FAILURE(status)) {
1174         return dest;
1175     }
1176     if (U_FAILURE(fDeferredStatus)) {
1177         status = fDeferredStatus;
1178     } else if (fMatch == FALSE) {
1179         status = U_REGEX_INVALID_STATE;
1180     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1181         status = U_INDEX_OUTOFBOUNDS_ERROR;
1182     }
1183 
1184     if (U_FAILURE(status)) {
1185         return dest;
1186     }
1187 
1188     int64_t s, e;
1189     if (groupNum == 0) {
1190         s = fMatchStart;
1191         e = fMatchEnd;
1192     } else {
1193         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1194         U_ASSERT(groupOffset < fPattern->fFrameSize);
1195         U_ASSERT(groupOffset >= 0);
1196         s = fFrame->fExtra[groupOffset];
1197         e = fFrame->fExtra[groupOffset+1];
1198     }
1199 
1200     if (s < 0) {
1201         // A capture group wasn't part of the match
1202         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1203     }
1204     U_ASSERT(s <= e);
1205     group_len = e - s;
1206 
1207     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1208     if (dest)
1209         UTEXT_SETNATIVEINDEX(dest, s);
1210     return dest;
1211 }
1212 
group(int32_t groupNum,UErrorCode & status) const1213 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1214     UnicodeString result;
1215     int64_t groupStart = start64(groupNum, status);
1216     int64_t groupEnd = end64(groupNum, status);
1217     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1218         return result;
1219     }
1220 
1221     // Get the group length using a utext_extract preflight.
1222     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1223     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1224     if (status != U_BUFFER_OVERFLOW_ERROR) {
1225         return result;
1226     }
1227 
1228     status = U_ZERO_ERROR;
1229     UChar *buf = result.getBuffer(length);
1230     if (buf == NULL) {
1231         status = U_MEMORY_ALLOCATION_ERROR;
1232     } else {
1233         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1234         result.releaseBuffer(extractLength);
1235         U_ASSERT(length == extractLength);
1236     }
1237     return result;
1238 }
1239 
1240 
1241 //--------------------------------------------------------------------------------
1242 //
1243 //  appendGroup() -- currently internal only, appends a group to a UText rather
1244 //                   than replacing its contents
1245 //
1246 //--------------------------------------------------------------------------------
1247 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1248 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1249     if (U_FAILURE(status)) {
1250         return 0;
1251     }
1252     if (U_FAILURE(fDeferredStatus)) {
1253         status = fDeferredStatus;
1254         return 0;
1255     }
1256     int64_t destLen = utext_nativeLength(dest);
1257 
1258     if (fMatch == FALSE) {
1259         status = U_REGEX_INVALID_STATE;
1260         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1261     }
1262     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1263         status = U_INDEX_OUTOFBOUNDS_ERROR;
1264         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1265     }
1266 
1267     int64_t s, e;
1268     if (groupNum == 0) {
1269         s = fMatchStart;
1270         e = fMatchEnd;
1271     } else {
1272         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1273         U_ASSERT(groupOffset < fPattern->fFrameSize);
1274         U_ASSERT(groupOffset >= 0);
1275         s = fFrame->fExtra[groupOffset];
1276         e = fFrame->fExtra[groupOffset+1];
1277     }
1278 
1279     if (s < 0) {
1280         // A capture group wasn't part of the match
1281         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1282     }
1283     U_ASSERT(s <= e);
1284 
1285     int64_t deltaLen;
1286     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1287         U_ASSERT(e <= fInputLength);
1288         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1289     } else {
1290         int32_t len16;
1291         if (UTEXT_USES_U16(fInputText)) {
1292             len16 = (int32_t)(e-s);
1293         } else {
1294             UErrorCode lengthStatus = U_ZERO_ERROR;
1295             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1296         }
1297         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1298         if (groupChars == NULL) {
1299             status = U_MEMORY_ALLOCATION_ERROR;
1300             return 0;
1301         }
1302         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1303 
1304         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1305         uprv_free(groupChars);
1306     }
1307     return deltaLen;
1308 }
1309 
1310 
1311 
1312 //--------------------------------------------------------------------------------
1313 //
1314 //  groupCount()
1315 //
1316 //--------------------------------------------------------------------------------
groupCount() const1317 int32_t RegexMatcher::groupCount() const {
1318     return fPattern->fGroupMap->size();
1319 }
1320 
1321 //--------------------------------------------------------------------------------
1322 //
1323 //  hasAnchoringBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1326 UBool RegexMatcher::hasAnchoringBounds() const {
1327     return fAnchoringBounds;
1328 }
1329 
1330 
1331 //--------------------------------------------------------------------------------
1332 //
1333 //  hasTransparentBounds()
1334 //
1335 //--------------------------------------------------------------------------------
hasTransparentBounds() const1336 UBool RegexMatcher::hasTransparentBounds() const {
1337     return fTransparentBounds;
1338 }
1339 
1340 
1341 
1342 //--------------------------------------------------------------------------------
1343 //
1344 //  hitEnd()
1345 //
1346 //--------------------------------------------------------------------------------
hitEnd() const1347 UBool RegexMatcher::hitEnd() const {
1348     return fHitEnd;
1349 }
1350 
1351 
1352 //--------------------------------------------------------------------------------
1353 //
1354 //  input()
1355 //
1356 //--------------------------------------------------------------------------------
input() const1357 const UnicodeString &RegexMatcher::input() const {
1358     if (!fInput) {
1359         UErrorCode status = U_ZERO_ERROR;
1360         int32_t len16;
1361         if (UTEXT_USES_U16(fInputText)) {
1362             len16 = (int32_t)fInputLength;
1363         } else {
1364             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1365             status = U_ZERO_ERROR; // overflow, length status
1366         }
1367         UnicodeString *result = new UnicodeString(len16, 0, 0);
1368 
1369         UChar *inputChars = result->getBuffer(len16);
1370         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1371         result->releaseBuffer(len16);
1372 
1373         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1374     }
1375 
1376     return *fInput;
1377 }
1378 
1379 //--------------------------------------------------------------------------------
1380 //
1381 //  inputText()
1382 //
1383 //--------------------------------------------------------------------------------
inputText() const1384 UText *RegexMatcher::inputText() const {
1385     return fInputText;
1386 }
1387 
1388 
1389 //--------------------------------------------------------------------------------
1390 //
1391 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1392 //
1393 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1394 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1395     if (U_FAILURE(status)) {
1396         return dest;
1397     }
1398     if (U_FAILURE(fDeferredStatus)) {
1399         status = fDeferredStatus;
1400         return dest;
1401     }
1402 
1403     if (dest) {
1404         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1405             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1406         } else {
1407             int32_t input16Len;
1408             if (UTEXT_USES_U16(fInputText)) {
1409                 input16Len = (int32_t)fInputLength;
1410             } else {
1411                 UErrorCode lengthStatus = U_ZERO_ERROR;
1412                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1413             }
1414             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1415             if (inputChars == NULL) {
1416                 return dest;
1417             }
1418 
1419             status = U_ZERO_ERROR;
1420             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1421             status = U_ZERO_ERROR;
1422             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1423 
1424             uprv_free(inputChars);
1425         }
1426         return dest;
1427     } else {
1428         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1429     }
1430 }
1431 
1432 
1433 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1434 static UBool compat_SyncMutableUTextContents(UText *ut) {
1435     UBool retVal = FALSE;
1436 
1437     //  In the following test, we're really only interested in whether the UText should switch
1438     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1439     //  will still point to the correct data.
1440     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1441         UnicodeString *us=(UnicodeString *)ut->context;
1442 
1443         // Update to the latest length.
1444         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1445         int32_t newLength = us->length();
1446 
1447         // Update the chunk description.
1448         // The buffer may have switched between stack- and heap-based.
1449         ut->chunkContents    = us->getBuffer();
1450         ut->chunkLength      = newLength;
1451         ut->chunkNativeLimit = newLength;
1452         ut->nativeIndexingLimit = newLength;
1453         retVal = TRUE;
1454     }
1455 
1456     return retVal;
1457 }
1458 
1459 //--------------------------------------------------------------------------------
1460 //
1461 //  lookingAt()
1462 //
1463 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1464 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1465     if (U_FAILURE(status)) {
1466         return FALSE;
1467     }
1468     if (U_FAILURE(fDeferredStatus)) {
1469         status = fDeferredStatus;
1470         return FALSE;
1471     }
1472 
1473     if (fInputUniStrMaybeMutable) {
1474         if (compat_SyncMutableUTextContents(fInputText)) {
1475         fInputLength = utext_nativeLength(fInputText);
1476         reset();
1477         }
1478     }
1479     else {
1480         resetPreserveRegion();
1481     }
1482     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1483         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1484     } else {
1485         MatchAt(fActiveStart, FALSE, status);
1486     }
1487     return fMatch;
1488 }
1489 
1490 
lookingAt(int64_t start,UErrorCode & status)1491 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1492     if (U_FAILURE(status)) {
1493         return FALSE;
1494     }
1495     if (U_FAILURE(fDeferredStatus)) {
1496         status = fDeferredStatus;
1497         return FALSE;
1498     }
1499     reset();
1500 
1501     if (start < 0) {
1502         status = U_INDEX_OUTOFBOUNDS_ERROR;
1503         return FALSE;
1504     }
1505 
1506     if (fInputUniStrMaybeMutable) {
1507         if (compat_SyncMutableUTextContents(fInputText)) {
1508         fInputLength = utext_nativeLength(fInputText);
1509         reset();
1510         }
1511     }
1512 
1513     int64_t nativeStart;
1514     nativeStart = start;
1515     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1516         status = U_INDEX_OUTOFBOUNDS_ERROR;
1517         return FALSE;
1518     }
1519 
1520     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1521         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1522     } else {
1523         MatchAt(nativeStart, FALSE, status);
1524     }
1525     return fMatch;
1526 }
1527 
1528 
1529 
1530 //--------------------------------------------------------------------------------
1531 //
1532 //  matches()
1533 //
1534 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1535 UBool RegexMatcher::matches(UErrorCode &status) {
1536     if (U_FAILURE(status)) {
1537         return FALSE;
1538     }
1539     if (U_FAILURE(fDeferredStatus)) {
1540         status = fDeferredStatus;
1541         return FALSE;
1542     }
1543 
1544     if (fInputUniStrMaybeMutable) {
1545         if (compat_SyncMutableUTextContents(fInputText)) {
1546         fInputLength = utext_nativeLength(fInputText);
1547         reset();
1548         }
1549     }
1550     else {
1551         resetPreserveRegion();
1552     }
1553 
1554     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1555         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1556     } else {
1557         MatchAt(fActiveStart, TRUE, status);
1558     }
1559     return fMatch;
1560 }
1561 
1562 
matches(int64_t start,UErrorCode & status)1563 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1564     if (U_FAILURE(status)) {
1565         return FALSE;
1566     }
1567     if (U_FAILURE(fDeferredStatus)) {
1568         status = fDeferredStatus;
1569         return FALSE;
1570     }
1571     reset();
1572 
1573     if (start < 0) {
1574         status = U_INDEX_OUTOFBOUNDS_ERROR;
1575         return FALSE;
1576     }
1577 
1578     if (fInputUniStrMaybeMutable) {
1579         if (compat_SyncMutableUTextContents(fInputText)) {
1580         fInputLength = utext_nativeLength(fInputText);
1581         reset();
1582         }
1583     }
1584 
1585     int64_t nativeStart;
1586     nativeStart = start;
1587     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1588         status = U_INDEX_OUTOFBOUNDS_ERROR;
1589         return FALSE;
1590     }
1591 
1592     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1593         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1594     } else {
1595         MatchAt(nativeStart, TRUE, status);
1596     }
1597     return fMatch;
1598 }
1599 
1600 
1601 
1602 //--------------------------------------------------------------------------------
1603 //
1604 //    pattern
1605 //
1606 //--------------------------------------------------------------------------------
pattern() const1607 const RegexPattern &RegexMatcher::pattern() const {
1608     return *fPattern;
1609 }
1610 
1611 
1612 
1613 //--------------------------------------------------------------------------------
1614 //
1615 //    region
1616 //
1617 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1618 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1619     if (U_FAILURE(status)) {
1620         return *this;
1621     }
1622 
1623     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1624         status = U_ILLEGAL_ARGUMENT_ERROR;
1625     }
1626 
1627     int64_t nativeStart = regionStart;
1628     int64_t nativeLimit = regionLimit;
1629     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1630       status = U_ILLEGAL_ARGUMENT_ERROR;
1631     }
1632 
1633     if (startIndex == -1)
1634       this->reset();
1635     else
1636       resetPreserveRegion();
1637 
1638     fRegionStart = nativeStart;
1639     fRegionLimit = nativeLimit;
1640     fActiveStart = nativeStart;
1641     fActiveLimit = nativeLimit;
1642 
1643     if (startIndex != -1) {
1644       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1645           status = U_INDEX_OUTOFBOUNDS_ERROR;
1646       }
1647       fMatchEnd = startIndex;
1648     }
1649 
1650     if (!fTransparentBounds) {
1651         fLookStart = nativeStart;
1652         fLookLimit = nativeLimit;
1653     }
1654     if (fAnchoringBounds) {
1655         fAnchorStart = nativeStart;
1656         fAnchorLimit = nativeLimit;
1657     }
1658     return *this;
1659 }
1660 
region(int64_t start,int64_t limit,UErrorCode & status)1661 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1662   return region(start, limit, -1, status);
1663 }
1664 
1665 //--------------------------------------------------------------------------------
1666 //
1667 //    regionEnd
1668 //
1669 //--------------------------------------------------------------------------------
regionEnd() const1670 int32_t RegexMatcher::regionEnd() const {
1671     return (int32_t)fRegionLimit;
1672 }
1673 
regionEnd64() const1674 int64_t RegexMatcher::regionEnd64() const {
1675     return fRegionLimit;
1676 }
1677 
1678 //--------------------------------------------------------------------------------
1679 //
1680 //    regionStart
1681 //
1682 //--------------------------------------------------------------------------------
regionStart() const1683 int32_t RegexMatcher::regionStart() const {
1684     return (int32_t)fRegionStart;
1685 }
1686 
regionStart64() const1687 int64_t RegexMatcher::regionStart64() const {
1688     return fRegionStart;
1689 }
1690 
1691 
1692 //--------------------------------------------------------------------------------
1693 //
1694 //    replaceAll
1695 //
1696 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1697 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1698     UText replacementText = UTEXT_INITIALIZER;
1699     UText resultText = UTEXT_INITIALIZER;
1700     UnicodeString resultString;
1701     if (U_FAILURE(status)) {
1702         return resultString;
1703     }
1704 
1705     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1706     utext_openUnicodeString(&resultText, &resultString, &status);
1707 
1708     replaceAll(&replacementText, &resultText, status);
1709 
1710     utext_close(&resultText);
1711     utext_close(&replacementText);
1712 
1713     return resultString;
1714 }
1715 
1716 
1717 //
1718 //    replaceAll, UText mode
1719 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1720 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1721     if (U_FAILURE(status)) {
1722         return dest;
1723     }
1724     if (U_FAILURE(fDeferredStatus)) {
1725         status = fDeferredStatus;
1726         return dest;
1727     }
1728 
1729     if (dest == NULL) {
1730         UnicodeString emptyString;
1731         UText empty = UTEXT_INITIALIZER;
1732 
1733         utext_openUnicodeString(&empty, &emptyString, &status);
1734         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1735         utext_close(&empty);
1736     }
1737 
1738     if (U_SUCCESS(status)) {
1739         reset();
1740         while (find()) {
1741             appendReplacement(dest, replacement, status);
1742             if (U_FAILURE(status)) {
1743                 break;
1744             }
1745         }
1746         appendTail(dest, status);
1747     }
1748 
1749     return dest;
1750 }
1751 
1752 
1753 //--------------------------------------------------------------------------------
1754 //
1755 //    replaceFirst
1756 //
1757 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1758 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1759     UText replacementText = UTEXT_INITIALIZER;
1760     UText resultText = UTEXT_INITIALIZER;
1761     UnicodeString resultString;
1762 
1763     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1764     utext_openUnicodeString(&resultText, &resultString, &status);
1765 
1766     replaceFirst(&replacementText, &resultText, status);
1767 
1768     utext_close(&resultText);
1769     utext_close(&replacementText);
1770 
1771     return resultString;
1772 }
1773 
1774 //
1775 //    replaceFirst, UText mode
1776 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1777 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1778     if (U_FAILURE(status)) {
1779         return dest;
1780     }
1781     if (U_FAILURE(fDeferredStatus)) {
1782         status = fDeferredStatus;
1783         return dest;
1784     }
1785 
1786     reset();
1787     if (!find()) {
1788         return getInput(dest, status);
1789     }
1790 
1791     if (dest == NULL) {
1792         UnicodeString emptyString;
1793         UText empty = UTEXT_INITIALIZER;
1794 
1795         utext_openUnicodeString(&empty, &emptyString, &status);
1796         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1797         utext_close(&empty);
1798     }
1799 
1800     appendReplacement(dest, replacement, status);
1801     appendTail(dest, status);
1802 
1803     return dest;
1804 }
1805 
1806 
1807 //--------------------------------------------------------------------------------
1808 //
1809 //     requireEnd
1810 //
1811 //--------------------------------------------------------------------------------
requireEnd() const1812 UBool RegexMatcher::requireEnd() const {
1813     return fRequireEnd;
1814 }
1815 
1816 
1817 //--------------------------------------------------------------------------------
1818 //
1819 //     reset
1820 //
1821 //--------------------------------------------------------------------------------
reset()1822 RegexMatcher &RegexMatcher::reset() {
1823     fRegionStart    = 0;
1824     fRegionLimit    = fInputLength;
1825     fActiveStart    = 0;
1826     fActiveLimit    = fInputLength;
1827     fAnchorStart    = 0;
1828     fAnchorLimit    = fInputLength;
1829     fLookStart      = 0;
1830     fLookLimit      = fInputLength;
1831     resetPreserveRegion();
1832     return *this;
1833 }
1834 
1835 
1836 
resetPreserveRegion()1837 void RegexMatcher::resetPreserveRegion() {
1838     fMatchStart     = 0;
1839     fMatchEnd       = 0;
1840     fLastMatchEnd   = -1;
1841     fAppendPosition = 0;
1842     fMatch          = FALSE;
1843     fHitEnd         = FALSE;
1844     fRequireEnd     = FALSE;
1845     fTime           = 0;
1846     fTickCounter    = TIMER_INITIAL_VALUE;
1847     //resetStack(); // more expensive than it looks...
1848 }
1849 
1850 
reset(const UnicodeString & input)1851 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1852     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1853     if (fPattern->fNeedsAltInput) {
1854         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1855     }
1856     if (U_FAILURE(fDeferredStatus)) {
1857         return *this;
1858     }
1859     fInputLength = utext_nativeLength(fInputText);
1860 
1861     reset();
1862     delete fInput;
1863     fInput = NULL;
1864 
1865     //  Do the following for any UnicodeString.
1866     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1867     fInputUniStrMaybeMutable = TRUE;
1868 
1869 #if UCONFIG_NO_BREAK_ITERATION==0
1870     if (fWordBreakItr) {
1871         fWordBreakItr->setText(fInputText, fDeferredStatus);
1872     }
1873     if (fGCBreakItr) {
1874         fGCBreakItr->setText(fInputText, fDeferredStatus);
1875     }
1876 #endif
1877 
1878     return *this;
1879 }
1880 
1881 
reset(UText * input)1882 RegexMatcher &RegexMatcher::reset(UText *input) {
1883     if (fInputText != input) {
1884         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1885         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1886         if (U_FAILURE(fDeferredStatus)) {
1887             return *this;
1888         }
1889         fInputLength = utext_nativeLength(fInputText);
1890 
1891         delete fInput;
1892         fInput = NULL;
1893 
1894 #if UCONFIG_NO_BREAK_ITERATION==0
1895         if (fWordBreakItr) {
1896             fWordBreakItr->setText(input, fDeferredStatus);
1897         }
1898         if (fGCBreakItr) {
1899             fGCBreakItr->setText(fInputText, fDeferredStatus);
1900         }
1901 #endif
1902     }
1903     reset();
1904     fInputUniStrMaybeMutable = FALSE;
1905 
1906     return *this;
1907 }
1908 
1909 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1910     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1911     return *this;
1912 }*/
1913 
reset(int64_t position,UErrorCode & status)1914 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1915     if (U_FAILURE(status)) {
1916         return *this;
1917     }
1918     reset();       // Reset also resets the region to be the entire string.
1919 
1920     if (position < 0 || position > fActiveLimit) {
1921         status = U_INDEX_OUTOFBOUNDS_ERROR;
1922         return *this;
1923     }
1924     fMatchEnd = position;
1925     return *this;
1926 }
1927 
1928 
1929 //--------------------------------------------------------------------------------
1930 //
1931 //    refresh
1932 //
1933 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1934 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1935     if (U_FAILURE(status)) {
1936         return *this;
1937     }
1938     if (input == NULL) {
1939         status = U_ILLEGAL_ARGUMENT_ERROR;
1940         return *this;
1941     }
1942     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1943         status = U_ILLEGAL_ARGUMENT_ERROR;
1944         return *this;
1945     }
1946     int64_t  pos = utext_getNativeIndex(fInputText);
1947     //  Shallow read-only clone of the new UText into the existing input UText
1948     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1949     if (U_FAILURE(status)) {
1950         return *this;
1951     }
1952     utext_setNativeIndex(fInputText, pos);
1953 
1954     if (fAltInputText != NULL) {
1955         pos = utext_getNativeIndex(fAltInputText);
1956         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1957         if (U_FAILURE(status)) {
1958             return *this;
1959         }
1960         utext_setNativeIndex(fAltInputText, pos);
1961     }
1962     return *this;
1963 }
1964 
1965 
1966 
1967 //--------------------------------------------------------------------------------
1968 //
1969 //    setTrace
1970 //
1971 //--------------------------------------------------------------------------------
setTrace(UBool state)1972 void RegexMatcher::setTrace(UBool state) {
1973     fTraceDebug = state;
1974 }
1975 
1976 
1977 
1978 /**
1979   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1980   *
1981   *     @param src    The source UText
1982   *     @param dest   The destination UText. Must be writable.
1983   *                   May be NULL, in which case a new UText will be allocated.
1984   *     @param start  Start index of source substring.
1985   *     @param limit  Limit index of source substring.
1986   *     @param status An error code.
1987   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1988 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1989     if (U_FAILURE(*status)) {
1990         return dest;
1991     }
1992     if (start == limit) {
1993         if (dest) {
1994             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1995             return dest;
1996         } else {
1997             return utext_openUChars(NULL, NULL, 0, status);
1998         }
1999     }
2000     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
2001     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
2002         return dest;
2003     }
2004     *status = U_ZERO_ERROR;
2005     MaybeStackArray<UChar, 40> buffer;
2006     if (length >= buffer.getCapacity()) {
2007         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
2008         if (newBuf == NULL) {
2009             *status = U_MEMORY_ALLOCATION_ERROR;
2010         }
2011     }
2012     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2013     if (dest) {
2014         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2015         return dest;
2016     }
2017 
2018     // Caller did not provide a preexisting UText.
2019     // Open a new one, and have it adopt the text buffer storage.
2020     if (U_FAILURE(*status)) {
2021         return NULL;
2022     }
2023     int32_t ownedLength = 0;
2024     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2025     if (ownedBuf == NULL) {
2026         *status = U_MEMORY_ALLOCATION_ERROR;
2027         return NULL;
2028     }
2029     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2030     if (U_FAILURE(*status)) {
2031         uprv_free(ownedBuf);
2032         return NULL;
2033     }
2034     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2035     return result;
2036 }
2037 
2038 
2039 //---------------------------------------------------------------------
2040 //
2041 //   split
2042 //
2043 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2044 int32_t  RegexMatcher::split(const UnicodeString &input,
2045         UnicodeString    dest[],
2046         int32_t          destCapacity,
2047         UErrorCode      &status)
2048 {
2049     UText inputText = UTEXT_INITIALIZER;
2050     utext_openConstUnicodeString(&inputText, &input, &status);
2051     if (U_FAILURE(status)) {
2052         return 0;
2053     }
2054 
2055     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2056     if (destText == NULL) {
2057         status = U_MEMORY_ALLOCATION_ERROR;
2058         return 0;
2059     }
2060     int32_t i;
2061     for (i = 0; i < destCapacity; i++) {
2062         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2063     }
2064 
2065     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2066 
2067     for (i = 0; i < destCapacity; i++) {
2068         utext_close(destText[i]);
2069     }
2070 
2071     uprv_free(destText);
2072     utext_close(&inputText);
2073     return fieldCount;
2074 }
2075 
2076 //
2077 //   split, UText mode
2078 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2079 int32_t  RegexMatcher::split(UText *input,
2080         UText           *dest[],
2081         int32_t          destCapacity,
2082         UErrorCode      &status)
2083 {
2084     //
2085     // Check arguments for validity
2086     //
2087     if (U_FAILURE(status)) {
2088         return 0;
2089     }
2090 
2091     if (destCapacity < 1) {
2092         status = U_ILLEGAL_ARGUMENT_ERROR;
2093         return 0;
2094     }
2095 
2096     //
2097     // Reset for the input text
2098     //
2099     reset(input);
2100     int64_t   nextOutputStringStart = 0;
2101     if (fActiveLimit == 0) {
2102         return 0;
2103     }
2104 
2105     //
2106     // Loop through the input text, searching for the delimiter pattern
2107     //
2108     int32_t i;
2109     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2110     for (i=0; ; i++) {
2111         if (i>=destCapacity-1) {
2112             // There is one or zero output string left.
2113             // Fill the last output string with whatever is left from the input, then exit the loop.
2114             //  ( i will be == destCapacity if we filled the output array while processing
2115             //    capture groups of the delimiter expression, in which case we will discard the
2116             //    last capture group saved in favor of the unprocessed remainder of the
2117             //    input string.)
2118             i = destCapacity-1;
2119             if (fActiveLimit > nextOutputStringStart) {
2120                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2121                     if (dest[i]) {
2122                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2123                                       input->chunkContents+nextOutputStringStart,
2124                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2125                     } else {
2126                         UText remainingText = UTEXT_INITIALIZER;
2127                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2128                                          fActiveLimit-nextOutputStringStart, &status);
2129                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2130                         utext_close(&remainingText);
2131                     }
2132                 } else {
2133                     UErrorCode lengthStatus = U_ZERO_ERROR;
2134                     int32_t remaining16Length =
2135                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2136                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2137                     if (remainingChars == NULL) {
2138                         status = U_MEMORY_ALLOCATION_ERROR;
2139                         break;
2140                     }
2141 
2142                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2143                     if (dest[i]) {
2144                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2145                     } else {
2146                         UText remainingText = UTEXT_INITIALIZER;
2147                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2148                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2149                         utext_close(&remainingText);
2150                     }
2151 
2152                     uprv_free(remainingChars);
2153                 }
2154             }
2155             break;
2156         }
2157         if (find()) {
2158             // We found another delimiter.  Move everything from where we started looking
2159             //  up until the start of the delimiter into the next output string.
2160             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2161                 if (dest[i]) {
2162                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2163                                   input->chunkContents+nextOutputStringStart,
2164                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2165                 } else {
2166                     UText remainingText = UTEXT_INITIALIZER;
2167                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2168                                       fMatchStart-nextOutputStringStart, &status);
2169                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2170                     utext_close(&remainingText);
2171                 }
2172             } else {
2173                 UErrorCode lengthStatus = U_ZERO_ERROR;
2174                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2175                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2176                 if (remainingChars == NULL) {
2177                     status = U_MEMORY_ALLOCATION_ERROR;
2178                     break;
2179                 }
2180                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2181                 if (dest[i]) {
2182                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2183                 } else {
2184                     UText remainingText = UTEXT_INITIALIZER;
2185                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2186                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2187                     utext_close(&remainingText);
2188                 }
2189 
2190                 uprv_free(remainingChars);
2191             }
2192             nextOutputStringStart = fMatchEnd;
2193 
2194             // If the delimiter pattern has capturing parentheses, the captured
2195             //  text goes out into the next n destination strings.
2196             int32_t groupNum;
2197             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2198                 if (i >= destCapacity-2) {
2199                     // Never fill the last available output string with capture group text.
2200                     // It will filled with the last field, the remainder of the
2201                     //  unsplit input text.
2202                     break;
2203                 }
2204                 i++;
2205                 dest[i] = utext_extract_replace(fInputText, dest[i],
2206                                                start64(groupNum, status), end64(groupNum, status), &status);
2207             }
2208 
2209             if (nextOutputStringStart == fActiveLimit) {
2210                 // The delimiter was at the end of the string.  We're done, but first
2211                 // we output one last empty string, for the empty field following
2212                 //   the delimiter at the end of input.
2213                 if (i+1 < destCapacity) {
2214                     ++i;
2215                     if (dest[i] == NULL) {
2216                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2217                     } else {
2218                         static const UChar emptyString[] = {(UChar)0};
2219                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2220                     }
2221                 }
2222                 break;
2223 
2224             }
2225         }
2226         else
2227         {
2228             // We ran off the end of the input while looking for the next delimiter.
2229             // All the remaining text goes into the current output string.
2230             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2231                 if (dest[i]) {
2232                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2233                                   input->chunkContents+nextOutputStringStart,
2234                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2235                 } else {
2236                     UText remainingText = UTEXT_INITIALIZER;
2237                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2238                                      fActiveLimit-nextOutputStringStart, &status);
2239                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2240                     utext_close(&remainingText);
2241                 }
2242             } else {
2243                 UErrorCode lengthStatus = U_ZERO_ERROR;
2244                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2245                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2246                 if (remainingChars == NULL) {
2247                     status = U_MEMORY_ALLOCATION_ERROR;
2248                     break;
2249                 }
2250 
2251                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2252                 if (dest[i]) {
2253                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2254                 } else {
2255                     UText remainingText = UTEXT_INITIALIZER;
2256                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2257                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2258                     utext_close(&remainingText);
2259                 }
2260 
2261                 uprv_free(remainingChars);
2262             }
2263             break;
2264         }
2265         if (U_FAILURE(status)) {
2266             break;
2267         }
2268     }   // end of for loop
2269     return i+1;
2270 }
2271 
2272 
2273 //--------------------------------------------------------------------------------
2274 //
2275 //     start
2276 //
2277 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2278 int32_t RegexMatcher::start(UErrorCode &status) const {
2279     return start(0, status);
2280 }
2281 
start64(UErrorCode & status) const2282 int64_t RegexMatcher::start64(UErrorCode &status) const {
2283     return start64(0, status);
2284 }
2285 
2286 //--------------------------------------------------------------------------------
2287 //
2288 //     start(int32_t group, UErrorCode &status)
2289 //
2290 //--------------------------------------------------------------------------------
2291 
start64(int32_t group,UErrorCode & status) const2292 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2293     if (U_FAILURE(status)) {
2294         return -1;
2295     }
2296     if (U_FAILURE(fDeferredStatus)) {
2297         status = fDeferredStatus;
2298         return -1;
2299     }
2300     if (fMatch == FALSE) {
2301         status = U_REGEX_INVALID_STATE;
2302         return -1;
2303     }
2304     if (group < 0 || group > fPattern->fGroupMap->size()) {
2305         status = U_INDEX_OUTOFBOUNDS_ERROR;
2306         return -1;
2307     }
2308     int64_t s;
2309     if (group == 0) {
2310         s = fMatchStart;
2311     } else {
2312         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2313         U_ASSERT(groupOffset < fPattern->fFrameSize);
2314         U_ASSERT(groupOffset >= 0);
2315         s = fFrame->fExtra[groupOffset];
2316     }
2317 
2318     return s;
2319 }
2320 
2321 
start(int32_t group,UErrorCode & status) const2322 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2323     return (int32_t)start64(group, status);
2324 }
2325 
2326 //--------------------------------------------------------------------------------
2327 //
2328 //     useAnchoringBounds
2329 //
2330 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2331 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2332     fAnchoringBounds = b;
2333     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2334     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2335     return *this;
2336 }
2337 
2338 
2339 //--------------------------------------------------------------------------------
2340 //
2341 //     useTransparentBounds
2342 //
2343 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2344 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2345     fTransparentBounds = b;
2346     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2347     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2348     return *this;
2349 }
2350 
2351 //--------------------------------------------------------------------------------
2352 //
2353 //     setTimeLimit
2354 //
2355 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2356 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2357     if (U_FAILURE(status)) {
2358         return;
2359     }
2360     if (U_FAILURE(fDeferredStatus)) {
2361         status = fDeferredStatus;
2362         return;
2363     }
2364     if (limit < 0) {
2365         status = U_ILLEGAL_ARGUMENT_ERROR;
2366         return;
2367     }
2368     fTimeLimit = limit;
2369 }
2370 
2371 
2372 //--------------------------------------------------------------------------------
2373 //
2374 //     getTimeLimit
2375 //
2376 //--------------------------------------------------------------------------------
getTimeLimit() const2377 int32_t RegexMatcher::getTimeLimit() const {
2378     return fTimeLimit;
2379 }
2380 
2381 
2382 //--------------------------------------------------------------------------------
2383 //
2384 //     setStackLimit
2385 //
2386 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2387 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2388     if (U_FAILURE(status)) {
2389         return;
2390     }
2391     if (U_FAILURE(fDeferredStatus)) {
2392         status = fDeferredStatus;
2393         return;
2394     }
2395     if (limit < 0) {
2396         status = U_ILLEGAL_ARGUMENT_ERROR;
2397         return;
2398     }
2399 
2400     // Reset the matcher.  This is needed here in case there is a current match
2401     //    whose final stack frame (containing the match results, pointed to by fFrame)
2402     //    would be lost by resizing to a smaller stack size.
2403     reset();
2404 
2405     if (limit == 0) {
2406         // Unlimited stack expansion
2407         fStack->setMaxCapacity(0);
2408     } else {
2409         // Change the units of the limit  from bytes to ints, and bump the size up
2410         //   to be big enough to hold at least one stack frame for the pattern,
2411         //   if it isn't there already.
2412         int32_t adjustedLimit = limit / sizeof(int32_t);
2413         if (adjustedLimit < fPattern->fFrameSize) {
2414             adjustedLimit = fPattern->fFrameSize;
2415         }
2416         fStack->setMaxCapacity(adjustedLimit);
2417     }
2418     fStackLimit = limit;
2419 }
2420 
2421 
2422 //--------------------------------------------------------------------------------
2423 //
2424 //     getStackLimit
2425 //
2426 //--------------------------------------------------------------------------------
getStackLimit() const2427 int32_t RegexMatcher::getStackLimit() const {
2428     return fStackLimit;
2429 }
2430 
2431 
2432 //--------------------------------------------------------------------------------
2433 //
2434 //     setMatchCallback
2435 //
2436 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2437 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2438                                     const void              *context,
2439                                     UErrorCode              &status) {
2440     if (U_FAILURE(status)) {
2441         return;
2442     }
2443     fCallbackFn = callback;
2444     fCallbackContext = context;
2445 }
2446 
2447 
2448 //--------------------------------------------------------------------------------
2449 //
2450 //     getMatchCallback
2451 //
2452 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2453 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2454                                   const void              *&context,
2455                                   UErrorCode              &status) {
2456     if (U_FAILURE(status)) {
2457        return;
2458     }
2459     callback = fCallbackFn;
2460     context  = fCallbackContext;
2461 }
2462 
2463 
2464 //--------------------------------------------------------------------------------
2465 //
2466 //     setMatchCallback
2467 //
2468 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2469 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2470                                                 const void                      *context,
2471                                                 UErrorCode                      &status) {
2472     if (U_FAILURE(status)) {
2473         return;
2474     }
2475     fFindProgressCallbackFn = callback;
2476     fFindProgressCallbackContext = context;
2477 }
2478 
2479 
2480 //--------------------------------------------------------------------------------
2481 //
2482 //     getMatchCallback
2483 //
2484 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2485 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2486                                                 const void                    *&context,
2487                                                 UErrorCode                    &status) {
2488     if (U_FAILURE(status)) {
2489        return;
2490     }
2491     callback = fFindProgressCallbackFn;
2492     context  = fFindProgressCallbackContext;
2493 }
2494 
2495 
2496 //================================================================================
2497 //
2498 //    Code following this point in this file is the internal
2499 //    Match Engine Implementation.
2500 //
2501 //================================================================================
2502 
2503 
2504 //--------------------------------------------------------------------------------
2505 //
2506 //   resetStack
2507 //           Discard any previous contents of the state save stack, and initialize a
2508 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2509 //           where they indicate that a group has not yet matched anything.
2510 //--------------------------------------------------------------------------------
resetStack()2511 REStackFrame *RegexMatcher::resetStack() {
2512     // Discard any previous contents of the state save stack, and initialize a
2513     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2514     //  where they indicate that a group has not yet matched anything.
2515     fStack->removeAllElements();
2516 
2517     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2518     if(U_FAILURE(fDeferredStatus)) {
2519         return NULL;
2520     }
2521 
2522     int32_t i;
2523     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2524         iFrame->fExtra[i] = -1;
2525     }
2526     return iFrame;
2527 }
2528 
2529 
2530 
2531 //--------------------------------------------------------------------------------
2532 //
2533 //   isWordBoundary
2534 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2535 //                     For us,
2536 //                       If the current char is a combining mark,
2537 //                          \b is FALSE.
2538 //                       Else Scan backwards to the first non-combining char.
2539 //                            We are at a boundary if the this char and the original chars are
2540 //                               opposite in membership in \w set
2541 //
2542 //          parameters:   pos   - the current position in the input buffer
2543 //
2544 //              TODO:  double-check edge cases at region boundaries.
2545 //
2546 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2547 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2548     UBool isBoundary = FALSE;
2549     UBool cIsWord    = FALSE;
2550 
2551     if (pos >= fLookLimit) {
2552         fHitEnd = TRUE;
2553     } else {
2554         // Determine whether char c at current position is a member of the word set of chars.
2555         // If we're off the end of the string, behave as though we're not at a word char.
2556         UTEXT_SETNATIVEINDEX(fInputText, pos);
2557         UChar32  c = UTEXT_CURRENT32(fInputText);
2558         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2559             // Current char is a combining one.  Not a boundary.
2560             return FALSE;
2561         }
2562         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2563     }
2564 
2565     // Back up until we come to a non-combining char, determine whether
2566     //  that char is a word char.
2567     UBool prevCIsWord = FALSE;
2568     for (;;) {
2569         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2570             break;
2571         }
2572         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2573         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2574               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2575             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2576             break;
2577         }
2578     }
2579     isBoundary = cIsWord ^ prevCIsWord;
2580     return isBoundary;
2581 }
2582 
isChunkWordBoundary(int32_t pos)2583 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2584     UBool isBoundary = FALSE;
2585     UBool cIsWord    = FALSE;
2586 
2587     const UChar *inputBuf = fInputText->chunkContents;
2588 
2589     if (pos >= fLookLimit) {
2590         fHitEnd = TRUE;
2591     } else {
2592         // Determine whether char c at current position is a member of the word set of chars.
2593         // If we're off the end of the string, behave as though we're not at a word char.
2594         UChar32 c;
2595         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2596         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2597             // Current char is a combining one.  Not a boundary.
2598             return FALSE;
2599         }
2600         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2601     }
2602 
2603     // Back up until we come to a non-combining char, determine whether
2604     //  that char is a word char.
2605     UBool prevCIsWord = FALSE;
2606     for (;;) {
2607         if (pos <= fLookStart) {
2608             break;
2609         }
2610         UChar32 prevChar;
2611         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2612         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2613               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2614             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2615             break;
2616         }
2617     }
2618     isBoundary = cIsWord ^ prevCIsWord;
2619     return isBoundary;
2620 }
2621 
2622 //--------------------------------------------------------------------------------
2623 //
2624 //   isUWordBoundary
2625 //
2626 //         Test for a word boundary using RBBI word break.
2627 //
2628 //          parameters:   pos   - the current position in the input buffer
2629 //
2630 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos,UErrorCode & status)2631 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2632     UBool       returnVal = FALSE;
2633 
2634 #if UCONFIG_NO_BREAK_ITERATION==0
2635     // Note: this point will never be reached if break iteration is configured out.
2636     //       Regex patterns that would require this function will fail to compile.
2637 
2638     // If we haven't yet created a break iterator for this matcher, do it now.
2639     if (fWordBreakItr == nullptr) {
2640         fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2641         if (U_FAILURE(status)) {
2642             return FALSE;
2643         }
2644         fWordBreakItr->setText(fInputText, status);
2645     }
2646 
2647     // Note: zero width boundary tests like \b see through transparent region bounds,
2648     //       which is why fLookLimit is used here, rather than fActiveLimit.
2649     if (pos >= fLookLimit) {
2650         fHitEnd = TRUE;
2651         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2652                             //    words are not boundaries.  All non-word chars stand by themselves,
2653                             //    with word boundaries on both sides.
2654     } else {
2655         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2656     }
2657 #endif
2658     return   returnVal;
2659 }
2660 
2661 
followingGCBoundary(int64_t pos,UErrorCode & status)2662 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2663     int64_t result = pos;
2664 
2665 #if UCONFIG_NO_BREAK_ITERATION==0
2666     // Note: this point will never be reached if break iteration is configured out.
2667     //       Regex patterns that would require this function will fail to compile.
2668 
2669     // If we haven't yet created a break iterator for this matcher, do it now.
2670     if (fGCBreakItr == nullptr) {
2671         fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2672         if (U_FAILURE(status)) {
2673             return pos;
2674         }
2675         fGCBreakItr->setText(fInputText, status);
2676     }
2677     result = fGCBreakItr->following(pos);
2678     if (result == BreakIterator::DONE) {
2679         result = pos;
2680     }
2681 #endif
2682     return result;
2683 }
2684 
2685 //--------------------------------------------------------------------------------
2686 //
2687 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2688 //                     saves. Increment the "time" counter, and call the
2689 //                     user callback function if there is one installed.
2690 //
2691 //                     If the match operation needs to be aborted, either for a time-out
2692 //                     or because the user callback asked for it, just set an error status.
2693 //                     The engine will pick that up and stop in its outer loop.
2694 //
2695 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2696 void RegexMatcher::IncrementTime(UErrorCode &status) {
2697     fTickCounter = TIMER_INITIAL_VALUE;
2698     fTime++;
2699     if (fCallbackFn != NULL) {
2700         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2701             status = U_REGEX_STOPPED_BY_CALLER;
2702             return;
2703         }
2704     }
2705     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2706         status = U_REGEX_TIME_OUT;
2707     }
2708 }
2709 
2710 //--------------------------------------------------------------------------------
2711 //
2712 //   StateSave
2713 //       Make a new stack frame, initialized as a copy of the current stack frame.
2714 //       Set the pattern index in the original stack frame from the operand value
2715 //       in the opcode.  Execution of the engine continues with the state in
2716 //       the newly created stack frame
2717 //
2718 //       Note that reserveBlock() may grow the stack, resulting in the
2719 //       whole thing being relocated in memory.
2720 //
2721 //    Parameters:
2722 //       fp           The top frame pointer when called.  At return, a new
2723 //                    fame will be present
2724 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2725 //                    (not new) frame.  If execution ever back-tracks out of the
2726 //                    new frame, this will be where we continue from in the pattern.
2727 //    Return
2728 //                    The new frame pointer.
2729 //
2730 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2731 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2732     if (U_FAILURE(status)) {
2733         return fp;
2734     }
2735     // push storage for a new frame.
2736     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2737     if (U_FAILURE(status)) {
2738         // Failure on attempted stack expansion.
2739         //   Stack function set some other error code, change it to a more
2740         //   specific one for regular expressions.
2741         status = U_REGEX_STACK_OVERFLOW;
2742         // We need to return a writable stack frame, so just return the
2743         //    previous frame.  The match operation will stop quickly
2744         //    because of the error status, after which the frame will never
2745         //    be looked at again.
2746         return fp;
2747     }
2748     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2749 
2750     // New stack frame = copy of old top frame.
2751     int64_t *source = (int64_t *)fp;
2752     int64_t *dest   = newFP;
2753     for (;;) {
2754         *dest++ = *source++;
2755         if (source == newFP) {
2756             break;
2757         }
2758     }
2759 
2760     fTickCounter--;
2761     if (fTickCounter <= 0) {
2762        IncrementTime(status);    // Re-initializes fTickCounter
2763     }
2764     fp->fPatIdx = savePatIdx;
2765     return (REStackFrame *)newFP;
2766 }
2767 
2768 #if defined(REGEX_DEBUG)
2769 namespace {
StringFromUText(UText * ut)2770 UnicodeString StringFromUText(UText *ut) {
2771     UnicodeString result;
2772     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2773         result.append(c);
2774     }
2775     return result;
2776 }
2777 }
2778 #endif // REGEX_DEBUG
2779 
2780 
2781 //--------------------------------------------------------------------------------
2782 //
2783 //   MatchAt      This is the actual matching engine.
2784 //
2785 //                  startIdx:    begin matching a this index.
2786 //                  toEnd:       if true, match must extend to end of the input region
2787 //
2788 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2789 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2790     UBool       isMatch  = FALSE;      // True if the we have a match.
2791 
2792     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2793 
2794     int32_t     op;                    // Operation from the compiled pattern, split into
2795     int32_t     opType;                //    the opcode
2796     int32_t     opValue;               //    and the operand value.
2797 
2798 #ifdef REGEX_RUN_DEBUG
2799     if (fTraceDebug) {
2800         printf("MatchAt(startIdx=%ld)\n", startIdx);
2801         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2802         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2803     }
2804 #endif
2805 
2806     if (U_FAILURE(status)) {
2807         return;
2808     }
2809 
2810     //  Cache frequently referenced items from the compiled pattern
2811     //
2812     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2813 
2814     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2815     UVector             *fSets         = fPattern->fSets;
2816 
2817     fFrameSize = fPattern->fFrameSize;
2818     REStackFrame        *fp            = resetStack();
2819     if (U_FAILURE(fDeferredStatus)) {
2820         status = fDeferredStatus;
2821         return;
2822     }
2823 
2824     fp->fPatIdx   = 0;
2825     fp->fInputIdx = startIdx;
2826 
2827     // Zero out the pattern's static data
2828     int32_t i;
2829     for (i = 0; i<fPattern->fDataSize; i++) {
2830         fData[i] = 0;
2831     }
2832 
2833     //
2834     //  Main loop for interpreting the compiled pattern.
2835     //  One iteration of the loop per pattern operation performed.
2836     //
2837     for (;;) {
2838         op      = (int32_t)pat[fp->fPatIdx];
2839         opType  = URX_TYPE(op);
2840         opValue = URX_VAL(op);
2841 #ifdef REGEX_RUN_DEBUG
2842         if (fTraceDebug) {
2843             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2844             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2845                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2846             fPattern->dumpOp(fp->fPatIdx);
2847         }
2848 #endif
2849         fp->fPatIdx++;
2850 
2851         switch (opType) {
2852 
2853 
2854         case URX_NOP:
2855             break;
2856 
2857 
2858         case URX_BACKTRACK:
2859             // Force a backtrack.  In some circumstances, the pattern compiler
2860             //   will notice that the pattern can't possibly match anything, and will
2861             //   emit one of these at that point.
2862             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2863             break;
2864 
2865 
2866         case URX_ONECHAR:
2867             if (fp->fInputIdx < fActiveLimit) {
2868                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869                 UChar32 c = UTEXT_NEXT32(fInputText);
2870                 if (c == opValue) {
2871                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2872                     break;
2873                 }
2874             } else {
2875                 fHitEnd = TRUE;
2876             }
2877             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878             break;
2879 
2880 
2881         case URX_STRING:
2882             {
2883                 // Test input against a literal string.
2884                 // Strings require two slots in the compiled pattern, one for the
2885                 //   offset to the string text, and one for the length.
2886 
2887                 int32_t   stringStartIdx = opValue;
2888                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2889                 fp->fPatIdx++;
2890                 opType    = URX_TYPE(op);
2891                 int32_t stringLen = URX_VAL(op);
2892                 U_ASSERT(opType == URX_STRING_LEN);
2893                 U_ASSERT(stringLen >= 2);
2894 
2895                 const UChar *patternString = litText+stringStartIdx;
2896                 int32_t patternStringIndex = 0;
2897                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898                 UChar32 inputChar;
2899                 UChar32 patternChar;
2900                 UBool success = TRUE;
2901                 while (patternStringIndex < stringLen) {
2902                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2903                         success = FALSE;
2904                         fHitEnd = TRUE;
2905                         break;
2906                     }
2907                     inputChar = UTEXT_NEXT32(fInputText);
2908                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2909                     if (patternChar != inputChar) {
2910                         success = FALSE;
2911                         break;
2912                     }
2913                 }
2914 
2915                 if (success) {
2916                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2917                 } else {
2918                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2919                 }
2920             }
2921             break;
2922 
2923 
2924         case URX_STATE_SAVE:
2925             fp = StateSave(fp, opValue, status);
2926             break;
2927 
2928 
2929         case URX_END:
2930             // The match loop will exit via this path on a successful match,
2931             //   when we reach the end of the pattern.
2932             if (toEnd && fp->fInputIdx != fActiveLimit) {
2933                 // The pattern matched, but not to the end of input.  Try some more.
2934                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2935                 break;
2936             }
2937             isMatch = TRUE;
2938             goto  breakFromLoop;
2939 
2940         // Start and End Capture stack frame variables are laid out out like this:
2941             //  fp->fExtra[opValue]  - The start of a completed capture group
2942             //             opValue+1 - The end   of a completed capture group
2943             //             opValue+2 - the start of a capture group whose end
2944             //                          has not yet been reached (and might not ever be).
2945         case URX_START_CAPTURE:
2946             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2947             fp->fExtra[opValue+2] = fp->fInputIdx;
2948             break;
2949 
2950 
2951         case URX_END_CAPTURE:
2952             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2953             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2954             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2955             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2956             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2957             break;
2958 
2959 
2960         case URX_DOLLAR:                   //  $, test for End of line
2961                                            //     or for position before new line at end of input
2962             {
2963                 if (fp->fInputIdx >= fAnchorLimit) {
2964                     // We really are at the end of input.  Success.
2965                     fHitEnd = TRUE;
2966                     fRequireEnd = TRUE;
2967                     break;
2968                 }
2969 
2970                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2971 
2972                 // If we are positioned just before a new-line that is located at the
2973                 //   end of input, succeed.
2974                 UChar32 c = UTEXT_NEXT32(fInputText);
2975                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2976                     if (isLineTerminator(c)) {
2977                         // If not in the middle of a CR/LF sequence
2978                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2979                             // At new-line at end of input. Success
2980                             fHitEnd = TRUE;
2981                             fRequireEnd = TRUE;
2982 
2983                             break;
2984                         }
2985                     }
2986                 } else {
2987                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2988                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2989                         fHitEnd = TRUE;
2990                         fRequireEnd = TRUE;
2991                         break;                         // At CR/LF at end of input.  Success
2992                     }
2993                 }
2994 
2995                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2996             }
2997             break;
2998 
2999 
3000          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3001             if (fp->fInputIdx >= fAnchorLimit) {
3002                 // Off the end of input.  Success.
3003                 fHitEnd = TRUE;
3004                 fRequireEnd = TRUE;
3005                 break;
3006             } else {
3007                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3008                 UChar32 c = UTEXT_NEXT32(fInputText);
3009                 // Either at the last character of input, or off the end.
3010                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3011                     fHitEnd = TRUE;
3012                     fRequireEnd = TRUE;
3013                     break;
3014                 }
3015             }
3016 
3017             // Not at end of input.  Back-track out.
3018             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3019             break;
3020 
3021 
3022          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3023              {
3024                  if (fp->fInputIdx >= fAnchorLimit) {
3025                      // We really are at the end of input.  Success.
3026                      fHitEnd = TRUE;
3027                      fRequireEnd = TRUE;
3028                      break;
3029                  }
3030                  // If we are positioned just before a new-line, succeed.
3031                  // It makes no difference where the new-line is within the input.
3032                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3033                  UChar32 c = UTEXT_CURRENT32(fInputText);
3034                  if (isLineTerminator(c)) {
3035                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3036                      //  In multi-line mode, hitting a new-line just before the end of input does not
3037                      //   set the hitEnd or requireEnd flags
3038                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3039                         break;
3040                      }
3041                  }
3042                  // not at a new line.  Fail.
3043                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044              }
3045              break;
3046 
3047 
3048          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3049              {
3050                  if (fp->fInputIdx >= fAnchorLimit) {
3051                      // We really are at the end of input.  Success.
3052                      fHitEnd = TRUE;
3053                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3054                      break;               //   adding a new-line would not lose the match.
3055                  }
3056                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3057                  // It makes no difference where the new-line is within the input.
3058                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3059                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3060                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061                  }
3062              }
3063              break;
3064 
3065 
3066        case URX_CARET:                    //  ^, test for start of line
3067             if (fp->fInputIdx != fAnchorStart) {
3068                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3069             }
3070             break;
3071 
3072 
3073        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3074            {
3075                if (fp->fInputIdx == fAnchorStart) {
3076                    // We are at the start input.  Success.
3077                    break;
3078                }
3079                // Check whether character just before the current pos is a new-line
3080                //   unless we are at the end of input
3081                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3082                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3083                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3084                    //  It's a new-line.  ^ is true.  Success.
3085                    //  TODO:  what should be done with positions between a CR and LF?
3086                    break;
3087                }
3088                // Not at the start of a line.  Fail.
3089                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3090            }
3091            break;
3092 
3093 
3094        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3095            {
3096                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3097                if (fp->fInputIdx <= fAnchorStart) {
3098                    // We are at the start input.  Success.
3099                    break;
3100                }
3101                // Check whether character just before the current pos is a new-line
3102                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3103                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3104                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3105                if (c != 0x0a) {
3106                    // Not at the start of a line.  Back-track out.
3107                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108                }
3109            }
3110            break;
3111 
3112         case URX_BACKSLASH_B:          // Test for word boundaries
3113             {
3114                 UBool success = isWordBoundary(fp->fInputIdx);
3115                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3116                 if (!success) {
3117                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118                 }
3119             }
3120             break;
3121 
3122 
3123         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3124             {
3125                 UBool success = isUWordBoundary(fp->fInputIdx, status);
3126                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3127                 if (!success) {
3128                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129                 }
3130             }
3131             break;
3132 
3133 
3134         case URX_BACKSLASH_D:            // Test for decimal digit
3135             {
3136                 if (fp->fInputIdx >= fActiveLimit) {
3137                     fHitEnd = TRUE;
3138                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139                     break;
3140                 }
3141 
3142                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3143 
3144                 UChar32 c = UTEXT_NEXT32(fInputText);
3145                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3146                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3147                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3148                 if (success) {
3149                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150                 } else {
3151                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152                 }
3153             }
3154             break;
3155 
3156 
3157         case URX_BACKSLASH_G:          // Test for position at end of previous match
3158             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3159                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160             }
3161             break;
3162 
3163 
3164         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3165             {
3166                 if (fp->fInputIdx >= fActiveLimit) {
3167                     fHitEnd = TRUE;
3168                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169                     break;
3170                 }
3171                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3172                 UChar32 c = UTEXT_NEXT32(fInputText);
3173                 int8_t ctype = u_charType(c);
3174                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3175                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3176                 if (success) {
3177                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3178                 } else {
3179                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180                 }
3181             }
3182             break;
3183 
3184 
3185         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3186             {
3187                 if (fp->fInputIdx >= fActiveLimit) {
3188                     fHitEnd = TRUE;
3189                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3190                     break;
3191                 }
3192                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3193                 UChar32 c = UTEXT_NEXT32(fInputText);
3194                 if (isLineTerminator(c)) {
3195                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3196                         utext_next32(fInputText);
3197                     }
3198                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199                 } else {
3200                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201                 }
3202             }
3203             break;
3204 
3205 
3206         case URX_BACKSLASH_V:            // \v, any single line ending character.
3207             {
3208                 if (fp->fInputIdx >= fActiveLimit) {
3209                     fHitEnd = TRUE;
3210                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211                     break;
3212                 }
3213                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3214                 UChar32 c = UTEXT_NEXT32(fInputText);
3215                 UBool success = isLineTerminator(c);
3216                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3217                 if (success) {
3218                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3219                 } else {
3220                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3221                 }
3222             }
3223             break;
3224 
3225 
3226         case URX_BACKSLASH_X:
3227             //  Match a Grapheme, as defined by Unicode UAX 29.
3228 
3229             // Fail if at end of input
3230             if (fp->fInputIdx >= fActiveLimit) {
3231                 fHitEnd = TRUE;
3232                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3233                 break;
3234             }
3235 
3236             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3237             if (fp->fInputIdx >= fActiveLimit) {
3238                 fHitEnd = TRUE;
3239                 fp->fInputIdx = fActiveLimit;
3240             }
3241             break;
3242 
3243 
3244         case URX_BACKSLASH_Z:          // Test for end of Input
3245             if (fp->fInputIdx < fAnchorLimit) {
3246                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3247             } else {
3248                 fHitEnd = TRUE;
3249                 fRequireEnd = TRUE;
3250             }
3251             break;
3252 
3253 
3254 
3255         case URX_STATIC_SETREF:
3256             {
3257                 // Test input character against one of the predefined sets
3258                 //    (Word Characters, for example)
3259                 // The high bit of the op value is a flag for the match polarity.
3260                 //    0:   success if input char is in set.
3261                 //    1:   success if input char is not in set.
3262                 if (fp->fInputIdx >= fActiveLimit) {
3263                     fHitEnd = TRUE;
3264                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3265                     break;
3266                 }
3267 
3268                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3269                 opValue &= ~URX_NEG_SET;
3270                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3271 
3272                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3273                 UChar32 c = UTEXT_NEXT32(fInputText);
3274                 if (c < 256) {
3275                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3276                     if (s8.contains(c)) {
3277                         success = !success;
3278                     }
3279                 } else {
3280                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3281                     if (s.contains(c)) {
3282                         success = !success;
3283                     }
3284                 }
3285                 if (success) {
3286                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3287                 } else {
3288                     // the character wasn't in the set.
3289                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3290                 }
3291             }
3292             break;
3293 
3294 
3295         case URX_STAT_SETREF_N:
3296             {
3297                 // Test input character for NOT being a member of  one of
3298                 //    the predefined sets (Word Characters, for example)
3299                 if (fp->fInputIdx >= fActiveLimit) {
3300                     fHitEnd = TRUE;
3301                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3302                     break;
3303                 }
3304 
3305                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3306 
3307                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3308 
3309                 UChar32 c = UTEXT_NEXT32(fInputText);
3310                 if (c < 256) {
3311                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3312                     if (s8.contains(c) == FALSE) {
3313                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3314                         break;
3315                     }
3316                 } else {
3317                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3318                     if (s.contains(c) == FALSE) {
3319                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3320                         break;
3321                     }
3322                 }
3323                 // the character wasn't in the set.
3324                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3325             }
3326             break;
3327 
3328 
3329         case URX_SETREF:
3330             if (fp->fInputIdx >= fActiveLimit) {
3331                 fHitEnd = TRUE;
3332                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3333                 break;
3334             } else {
3335                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3336 
3337                 // There is input left.  Pick up one char and test it for set membership.
3338                 UChar32 c = UTEXT_NEXT32(fInputText);
3339                 U_ASSERT(opValue > 0 && opValue < fSets->size());
3340                 if (c<256) {
3341                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3342                     if (s8->contains(c)) {
3343                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3344                         break;
3345                     }
3346                 } else {
3347                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3348                     if (s->contains(c)) {
3349                         // The character is in the set.  A Match.
3350                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3351                         break;
3352                     }
3353                 }
3354 
3355                 // the character wasn't in the set.
3356                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3357             }
3358             break;
3359 
3360 
3361         case URX_DOTANY:
3362             {
3363                 // . matches anything, but stops at end-of-line.
3364                 if (fp->fInputIdx >= fActiveLimit) {
3365                     // At end of input.  Match failed.  Backtrack out.
3366                     fHitEnd = TRUE;
3367                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368                     break;
3369                 }
3370 
3371                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3372 
3373                 // There is input left.  Advance over one char, unless we've hit end-of-line
3374                 UChar32 c = UTEXT_NEXT32(fInputText);
3375                 if (isLineTerminator(c)) {
3376                     // End of line in normal mode.   . does not match.
3377                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3378                     break;
3379                 }
3380                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381             }
3382             break;
3383 
3384 
3385         case URX_DOTANY_ALL:
3386             {
3387                 // ., in dot-matches-all (including new lines) mode
3388                 if (fp->fInputIdx >= fActiveLimit) {
3389                     // At end of input.  Match failed.  Backtrack out.
3390                     fHitEnd = TRUE;
3391                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3392                     break;
3393                 }
3394 
3395                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3396 
3397                 // There is input left.  Advance over one char, except if we are
3398                 //   at a cr/lf, advance over both of them.
3399                 UChar32 c;
3400                 c = UTEXT_NEXT32(fInputText);
3401                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3402                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3403                     // In the case of a CR/LF, we need to advance over both.
3404                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3405                     if (nextc == 0x0a) {
3406                         (void)UTEXT_NEXT32(fInputText);
3407                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3408                     }
3409                 }
3410             }
3411             break;
3412 
3413 
3414         case URX_DOTANY_UNIX:
3415             {
3416                 // '.' operator, matches all, but stops at end-of-line.
3417                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3418                 if (fp->fInputIdx >= fActiveLimit) {
3419                     // At end of input.  Match failed.  Backtrack out.
3420                     fHitEnd = TRUE;
3421                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422                     break;
3423                 }
3424 
3425                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3426 
3427                 // There is input left.  Advance over one char, unless we've hit end-of-line
3428                 UChar32 c = UTEXT_NEXT32(fInputText);
3429                 if (c == 0x0a) {
3430                     // End of line in normal mode.   '.' does not match the \n
3431                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432                 } else {
3433                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3434                 }
3435             }
3436             break;
3437 
3438 
3439         case URX_JMP:
3440             fp->fPatIdx = opValue;
3441             break;
3442 
3443         case URX_FAIL:
3444             isMatch = FALSE;
3445             goto breakFromLoop;
3446 
3447         case URX_JMP_SAV:
3448             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3449             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3450             fp->fPatIdx = opValue;                         // Then JMP.
3451             break;
3452 
3453         case URX_JMP_SAV_X:
3454             // This opcode is used with (x)+, when x can match a zero length string.
3455             // Same as JMP_SAV, except conditional on the match having made forward progress.
3456             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3457             //   data address of the input position at the start of the loop.
3458             {
3459                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3460                 int32_t  stoOp = (int32_t)pat[opValue-1];
3461                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3462                 int32_t  frameLoc = URX_VAL(stoOp);
3463                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3464                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3465                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3466                 if (prevInputIdx < fp->fInputIdx) {
3467                     // The match did make progress.  Repeat the loop.
3468                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3469                     fp->fPatIdx = opValue;
3470                     fp->fExtra[frameLoc] = fp->fInputIdx;
3471                 }
3472                 // If the input position did not advance, we do nothing here,
3473                 //   execution will fall out of the loop.
3474             }
3475             break;
3476 
3477         case URX_CTR_INIT:
3478             {
3479                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3480                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3481 
3482                 // Pick up the three extra operands that CTR_INIT has, and
3483                 //    skip the pattern location counter past
3484                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3485                 fp->fPatIdx += 3;
3486                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3487                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3488                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3489                 U_ASSERT(minCount>=0);
3490                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3491                 U_ASSERT(loopLoc>=fp->fPatIdx);
3492 
3493                 if (minCount == 0) {
3494                     fp = StateSave(fp, loopLoc+1, status);
3495                 }
3496                 if (maxCount == -1) {
3497                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3498                 } else if (maxCount == 0) {
3499                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3500                 }
3501             }
3502             break;
3503 
3504         case URX_CTR_LOOP:
3505             {
3506                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3507                 int32_t initOp = (int32_t)pat[opValue];
3508                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3509                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3510                 int32_t minCount  = (int32_t)pat[opValue+2];
3511                 int32_t maxCount  = (int32_t)pat[opValue+3];
3512                 (*pCounter)++;
3513                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3514                     U_ASSERT(*pCounter == maxCount);
3515                     break;
3516                 }
3517                 if (*pCounter >= minCount) {
3518                     if (maxCount == -1) {
3519                         // Loop has no hard upper bound.
3520                         // Check that it is progressing through the input, break if it is not.
3521                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3522                         if (fp->fInputIdx == *pLastInputIdx) {
3523                             break;
3524                         } else {
3525                             *pLastInputIdx = fp->fInputIdx;
3526                         }
3527                     }
3528                     fp = StateSave(fp, fp->fPatIdx, status);
3529                 } else {
3530                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3531                     fTickCounter--;
3532                     if (fTickCounter <= 0) {
3533                         IncrementTime(status);    // Re-initializes fTickCounter
3534                     }
3535                 }
3536 
3537                 fp->fPatIdx = opValue + 4;    // Loop back.
3538             }
3539             break;
3540 
3541         case URX_CTR_INIT_NG:
3542             {
3543                 // Initialize a non-greedy loop
3544                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3545                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3546 
3547                 // Pick up the three extra operands that CTR_INIT_NG has, and
3548                 //    skip the pattern location counter past
3549                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3550                 fp->fPatIdx += 3;
3551                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3552                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3553                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3554                 U_ASSERT(minCount>=0);
3555                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3556                 U_ASSERT(loopLoc>fp->fPatIdx);
3557                 if (maxCount == -1) {
3558                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3559                 }
3560 
3561                 if (minCount == 0) {
3562                     if (maxCount != 0) {
3563                         fp = StateSave(fp, fp->fPatIdx, status);
3564                     }
3565                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3566                 }
3567             }
3568             break;
3569 
3570         case URX_CTR_LOOP_NG:
3571             {
3572                 // Non-greedy {min, max} loops
3573                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3574                 int32_t initOp = (int32_t)pat[opValue];
3575                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3576                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3577                 int32_t minCount  = (int32_t)pat[opValue+2];
3578                 int32_t maxCount  = (int32_t)pat[opValue+3];
3579 
3580                 (*pCounter)++;
3581                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3582                     // The loop has matched the maximum permitted number of times.
3583                     //   Break out of here with no action.  Matching will
3584                     //   continue with the following pattern.
3585                     U_ASSERT(*pCounter == maxCount);
3586                     break;
3587                 }
3588 
3589                 if (*pCounter < minCount) {
3590                     // We haven't met the minimum number of matches yet.
3591                     //   Loop back for another one.
3592                     fp->fPatIdx = opValue + 4;    // Loop back.
3593                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3594                     fTickCounter--;
3595                     if (fTickCounter <= 0) {
3596                         IncrementTime(status);    // Re-initializes fTickCounter
3597                     }
3598                 } else {
3599                     // We do have the minimum number of matches.
3600 
3601                     // If there is no upper bound on the loop iterations, check that the input index
3602                     // is progressing, and stop the loop if it is not.
3603                     if (maxCount == -1) {
3604                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3605                         if (fp->fInputIdx == *pLastInputIdx) {
3606                             break;
3607                         }
3608                         *pLastInputIdx = fp->fInputIdx;
3609                     }
3610 
3611                     // Loop Continuation: we will fall into the pattern following the loop
3612                     //   (non-greedy, don't execute loop body first), but first do
3613                     //   a state save to the top of the loop, so that a match failure
3614                     //   in the following pattern will try another iteration of the loop.
3615                     fp = StateSave(fp, opValue + 4, status);
3616                 }
3617             }
3618             break;
3619 
3620         case URX_STO_SP:
3621             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3622             fData[opValue] = fStack->size();
3623             break;
3624 
3625         case URX_LD_SP:
3626             {
3627                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3628                 int32_t newStackSize = (int32_t)fData[opValue];
3629                 U_ASSERT(newStackSize <= fStack->size());
3630                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3631                 if (newFP == (int64_t *)fp) {
3632                     break;
3633                 }
3634                 int32_t j;
3635                 for (j=0; j<fFrameSize; j++) {
3636                     newFP[j] = ((int64_t *)fp)[j];
3637                 }
3638                 fp = (REStackFrame *)newFP;
3639                 fStack->setSize(newStackSize);
3640             }
3641             break;
3642 
3643         case URX_BACKREF:
3644             {
3645                 U_ASSERT(opValue < fFrameSize);
3646                 int64_t groupStartIdx = fp->fExtra[opValue];
3647                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3648                 U_ASSERT(groupStartIdx <= groupEndIdx);
3649                 if (groupStartIdx < 0) {
3650                     // This capture group has not participated in the match thus far,
3651                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3652                     break;
3653                 }
3654                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3655                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3656 
3657                 //   Note: if the capture group match was of an empty string the backref
3658                 //         match succeeds.  Verified by testing:  Perl matches succeed
3659                 //         in this case, so we do too.
3660 
3661                 UBool success = TRUE;
3662                 for (;;) {
3663                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3664                         success = TRUE;
3665                         break;
3666                     }
3667                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3668                         success = FALSE;
3669                         fHitEnd = TRUE;
3670                         break;
3671                     }
3672                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3673                     UChar32 inputChar = utext_next32(fInputText);
3674                     if (inputChar != captureGroupChar) {
3675                         success = FALSE;
3676                         break;
3677                     }
3678                 }
3679 
3680                 if (success) {
3681                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3682                 } else {
3683                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3684                 }
3685             }
3686             break;
3687 
3688 
3689 
3690         case URX_BACKREF_I:
3691             {
3692                 U_ASSERT(opValue < fFrameSize);
3693                 int64_t groupStartIdx = fp->fExtra[opValue];
3694                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3695                 U_ASSERT(groupStartIdx <= groupEndIdx);
3696                 if (groupStartIdx < 0) {
3697                     // This capture group has not participated in the match thus far,
3698                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3699                     break;
3700                 }
3701                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3702                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3703                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3704                 CaseFoldingUTextIterator inputItr(*fInputText);
3705 
3706                 //   Note: if the capture group match was of an empty string the backref
3707                 //         match succeeds.  Verified by testing:  Perl matches succeed
3708                 //         in this case, so we do too.
3709 
3710                 UBool success = TRUE;
3711                 for (;;) {
3712                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3713                         success = TRUE;
3714                         break;
3715                     }
3716                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3717                         success = FALSE;
3718                         fHitEnd = TRUE;
3719                         break;
3720                     }
3721                     UChar32 captureGroupChar = captureGroupItr.next();
3722                     UChar32 inputChar = inputItr.next();
3723                     if (inputChar != captureGroupChar) {
3724                         success = FALSE;
3725                         break;
3726                     }
3727                 }
3728 
3729                 if (success && inputItr.inExpansion()) {
3730                     // We obtained a match by consuming part of a string obtained from
3731                     // case-folding a single code point of the input text.
3732                     // This does not count as an overall match.
3733                     success = FALSE;
3734                 }
3735 
3736                 if (success) {
3737                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3738                 } else {
3739                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3740                 }
3741 
3742             }
3743             break;
3744 
3745         case URX_STO_INP_LOC:
3746             {
3747                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3748                 fp->fExtra[opValue] = fp->fInputIdx;
3749             }
3750             break;
3751 
3752         case URX_JMPX:
3753             {
3754                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3755                 fp->fPatIdx += 1;
3756                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3757                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3758                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3759                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3760                 if (savedInputIdx < fp->fInputIdx) {
3761                     fp->fPatIdx = opValue;                               // JMP
3762                 } else {
3763                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3764                 }
3765             }
3766             break;
3767 
3768         case URX_LA_START:
3769             {
3770                 // Entering a look around block.
3771                 // Save Stack Ptr, Input Pos.
3772                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3773                 fData[opValue]   = fStack->size();
3774                 fData[opValue+1] = fp->fInputIdx;
3775                 fData[opValue+2] = fActiveStart;
3776                 fData[opValue+3] = fActiveLimit;
3777                 fActiveStart     = fLookStart;          // Set the match region change for
3778                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3779             }
3780             break;
3781 
3782         case URX_LA_END:
3783             {
3784                 // Leaving a look-ahead block.
3785                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3786                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3787                 int32_t stackSize = fStack->size();
3788                 int32_t newStackSize =(int32_t)fData[opValue];
3789                 U_ASSERT(stackSize >= newStackSize);
3790                 if (stackSize > newStackSize) {
3791                     // Copy the current top frame back to the new (cut back) top frame.
3792                     //   This makes the capture groups from within the look-ahead
3793                     //   expression available.
3794                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3795                     int32_t j;
3796                     for (j=0; j<fFrameSize; j++) {
3797                         newFP[j] = ((int64_t *)fp)[j];
3798                     }
3799                     fp = (REStackFrame *)newFP;
3800                     fStack->setSize(newStackSize);
3801                 }
3802                 fp->fInputIdx = fData[opValue+1];
3803 
3804                 // Restore the active region bounds in the input string; they may have
3805                 //    been changed because of transparent bounds on a Region.
3806                 fActiveStart = fData[opValue+2];
3807                 fActiveLimit = fData[opValue+3];
3808                 U_ASSERT(fActiveStart >= 0);
3809                 U_ASSERT(fActiveLimit <= fInputLength);
3810             }
3811             break;
3812 
3813         case URX_ONECHAR_I:
3814             // Case insensitive one char.  The char from the pattern is already case folded.
3815             // Input text is not, but case folding the input can not reduce two or more code
3816             // points to one.
3817             if (fp->fInputIdx < fActiveLimit) {
3818                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3819 
3820                 UChar32 c = UTEXT_NEXT32(fInputText);
3821                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3822                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3823                     break;
3824                 }
3825             } else {
3826                 fHitEnd = TRUE;
3827             }
3828 
3829             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3830             break;
3831 
3832         case URX_STRING_I:
3833             {
3834                 // Case-insensitive test input against a literal string.
3835                 // Strings require two slots in the compiled pattern, one for the
3836                 //   offset to the string text, and one for the length.
3837                 //   The compiled string has already been case folded.
3838                 {
3839                     const UChar *patternString = litText + opValue;
3840                     int32_t      patternStringIdx  = 0;
3841 
3842                     op      = (int32_t)pat[fp->fPatIdx];
3843                     fp->fPatIdx++;
3844                     opType  = URX_TYPE(op);
3845                     opValue = URX_VAL(op);
3846                     U_ASSERT(opType == URX_STRING_LEN);
3847                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3848 
3849 
3850                     UChar32   cPattern;
3851                     UChar32   cText;
3852                     UBool     success = TRUE;
3853 
3854                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3855                     CaseFoldingUTextIterator inputIterator(*fInputText);
3856                     while (patternStringIdx < patternStringLen) {
3857                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3858                             success = FALSE;
3859                             fHitEnd = TRUE;
3860                             break;
3861                         }
3862                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3863                         cText = inputIterator.next();
3864                         if (cText != cPattern) {
3865                             success = FALSE;
3866                             break;
3867                         }
3868                     }
3869                     if (inputIterator.inExpansion()) {
3870                         success = FALSE;
3871                     }
3872 
3873                     if (success) {
3874                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3875                     } else {
3876                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3877                     }
3878                 }
3879             }
3880             break;
3881 
3882         case URX_LB_START:
3883             {
3884                 // Entering a look-behind block.
3885                 // Save Stack Ptr, Input Pos and active input region.
3886                 //   TODO:  implement transparent bounds.  Ticket #6067
3887                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3888                 fData[opValue]   = fStack->size();
3889                 fData[opValue+1] = fp->fInputIdx;
3890                 // Save input string length, then reset to pin any matches to end at
3891                 //   the current position.
3892                 fData[opValue+2] = fActiveStart;
3893                 fData[opValue+3] = fActiveLimit;
3894                 fActiveStart     = fRegionStart;
3895                 fActiveLimit     = fp->fInputIdx;
3896                 // Init the variable containing the start index for attempted matches.
3897                 fData[opValue+4] = -1;
3898             }
3899             break;
3900 
3901 
3902         case URX_LB_CONT:
3903             {
3904                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3905                 //    at all possible input starting positions.
3906 
3907                 // Fetch the min and max possible match lengths.  They are the operands
3908                 //   of this op in the pattern.
3909                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3910                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3911                 if (!UTEXT_USES_U16(fInputText)) {
3912                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3913                     // The max length need not be exact; it just needs to be >= actual maximum.
3914                     maxML *= 3;
3915                 }
3916                 U_ASSERT(minML <= maxML);
3917                 U_ASSERT(minML >= 0);
3918 
3919                 // Fetch (from data) the last input index where a match was attempted.
3920                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3921                 int64_t  &lbStartIdx = fData[opValue+4];
3922                 if (lbStartIdx < 0) {
3923                     // First time through loop.
3924                     lbStartIdx = fp->fInputIdx - minML;
3925                     if (lbStartIdx > 0) {
3926                         // move index to a code point boundary, if it's not on one already.
3927                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3928                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929                     }
3930                 } else {
3931                     // 2nd through nth time through the loop.
3932                     // Back up start position for match by one.
3933                     if (lbStartIdx == 0) {
3934                         (lbStartIdx)--;
3935                     } else {
3936                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3937                         (void)UTEXT_PREVIOUS32(fInputText);
3938                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3939                     }
3940                 }
3941 
3942                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3943                     // We have tried all potential match starting points without
3944                     //  getting a match.  Backtrack out, and out of the
3945                     //   Look Behind altogether.
3946                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3947                     fActiveStart = fData[opValue+2];
3948                     fActiveLimit = fData[opValue+3];
3949                     U_ASSERT(fActiveStart >= 0);
3950                     U_ASSERT(fActiveLimit <= fInputLength);
3951                     break;
3952                 }
3953 
3954                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3955                 //      (successful match will fall off the end of the loop.)
3956                 fp = StateSave(fp, fp->fPatIdx-3, status);
3957                 fp->fInputIdx = lbStartIdx;
3958             }
3959             break;
3960 
3961         case URX_LB_END:
3962             // End of a look-behind block, after a successful match.
3963             {
3964                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3965                 if (fp->fInputIdx != fActiveLimit) {
3966                     //  The look-behind expression matched, but the match did not
3967                     //    extend all the way to the point that we are looking behind from.
3968                     //  FAIL out of here, which will take us back to the LB_CONT, which
3969                     //     will retry the match starting at another position or fail
3970                     //     the look-behind altogether, whichever is appropriate.
3971                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3972                     break;
3973                 }
3974 
3975                 // Look-behind match is good.  Restore the original input string region,
3976                 //   which had been truncated to pin the end of the lookbehind match to the
3977                 //   position being looked-behind.
3978                 fActiveStart = fData[opValue+2];
3979                 fActiveLimit = fData[opValue+3];
3980                 U_ASSERT(fActiveStart >= 0);
3981                 U_ASSERT(fActiveLimit <= fInputLength);
3982             }
3983             break;
3984 
3985 
3986         case URX_LBN_CONT:
3987             {
3988                 // Negative Look-Behind, at top of loop checking for matches of LB expression
3989                 //    at all possible input starting positions.
3990 
3991                 // Fetch the extra parameters of this op.
3992                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3993                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3994                 if (!UTEXT_USES_U16(fInputText)) {
3995                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3996                     // The max length need not be exact; it just needs to be >= actual maximum.
3997                     maxML *= 3;
3998                 }
3999                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4000                         continueLoc = URX_VAL(continueLoc);
4001                 U_ASSERT(minML <= maxML);
4002                 U_ASSERT(minML >= 0);
4003                 U_ASSERT(continueLoc > fp->fPatIdx);
4004 
4005                 // Fetch (from data) the last input index where a match was attempted.
4006                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4007                 int64_t  &lbStartIdx = fData[opValue+4];
4008                 if (lbStartIdx < 0) {
4009                     // First time through loop.
4010                     lbStartIdx = fp->fInputIdx - minML;
4011                     if (lbStartIdx > 0) {
4012                         // move index to a code point boundary, if it's not on one already.
4013                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4014                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4015                     }
4016                 } else {
4017                     // 2nd through nth time through the loop.
4018                     // Back up start position for match by one.
4019                     if (lbStartIdx == 0) {
4020                         (lbStartIdx)--;
4021                     } else {
4022                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4023                         (void)UTEXT_PREVIOUS32(fInputText);
4024                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4025                     }
4026                 }
4027 
4028                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4029                     // We have tried all potential match starting points without
4030                     //  getting a match, which means that the negative lookbehind as
4031                     //  a whole has succeeded.  Jump forward to the continue location
4032                     fActiveStart = fData[opValue+2];
4033                     fActiveLimit = fData[opValue+3];
4034                     U_ASSERT(fActiveStart >= 0);
4035                     U_ASSERT(fActiveLimit <= fInputLength);
4036                     fp->fPatIdx = continueLoc;
4037                     break;
4038                 }
4039 
4040                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4041                 //      (successful match will cause a FAIL out of the loop altogether.)
4042                 fp = StateSave(fp, fp->fPatIdx-4, status);
4043                 fp->fInputIdx = lbStartIdx;
4044             }
4045             break;
4046 
4047         case URX_LBN_END:
4048             // End of a negative look-behind block, after a successful match.
4049             {
4050                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4051                 if (fp->fInputIdx != fActiveLimit) {
4052                     //  The look-behind expression matched, but the match did not
4053                     //    extend all the way to the point that we are looking behind from.
4054                     //  FAIL out of here, which will take us back to the LB_CONT, which
4055                     //     will retry the match starting at another position or succeed
4056                     //     the look-behind altogether, whichever is appropriate.
4057                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4058                     break;
4059                 }
4060 
4061                 // Look-behind expression matched, which means look-behind test as
4062                 //   a whole Fails
4063 
4064                 //   Restore the original input string length, which had been truncated
4065                 //   inorder to pin the end of the lookbehind match
4066                 //   to the position being looked-behind.
4067                 fActiveStart = fData[opValue+2];
4068                 fActiveLimit = fData[opValue+3];
4069                 U_ASSERT(fActiveStart >= 0);
4070                 U_ASSERT(fActiveLimit <= fInputLength);
4071 
4072                 // Restore original stack position, discarding any state saved
4073                 //   by the successful pattern match.
4074                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4075                 int32_t newStackSize = (int32_t)fData[opValue];
4076                 U_ASSERT(fStack->size() > newStackSize);
4077                 fStack->setSize(newStackSize);
4078 
4079                 //  FAIL, which will take control back to someplace
4080                 //  prior to entering the look-behind test.
4081                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4082             }
4083             break;
4084 
4085 
4086         case URX_LOOP_SR_I:
4087             // Loop Initialization for the optimized implementation of
4088             //     [some character set]*
4089             //   This op scans through all matching input.
4090             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4091             {
4092                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4093                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4094                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4095 
4096                 // Loop through input, until either the input is exhausted or
4097                 //   we reach a character that is not a member of the set.
4098                 int64_t ix = fp->fInputIdx;
4099                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4100                 for (;;) {
4101                     if (ix >= fActiveLimit) {
4102                         fHitEnd = TRUE;
4103                         break;
4104                     }
4105                     UChar32 c = UTEXT_NEXT32(fInputText);
4106                     if (c<256) {
4107                         if (s8->contains(c) == FALSE) {
4108                             break;
4109                         }
4110                     } else {
4111                         if (s->contains(c) == FALSE) {
4112                             break;
4113                         }
4114                     }
4115                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4116                 }
4117 
4118                 // If there were no matching characters, skip over the loop altogether.
4119                 //   The loop doesn't run at all, a * op always succeeds.
4120                 if (ix == fp->fInputIdx) {
4121                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4122                     break;
4123                 }
4124 
4125                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4126                 //   must follow.  It's operand is the stack location
4127                 //   that holds the starting input index for the match of this [set]*
4128                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4129                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4130                 int32_t stackLoc = URX_VAL(loopcOp);
4131                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4132                 fp->fExtra[stackLoc] = fp->fInputIdx;
4133                 fp->fInputIdx = ix;
4134 
4135                 // Save State to the URX_LOOP_C op that follows this one,
4136                 //   so that match failures in the following code will return to there.
4137                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4138                 fp = StateSave(fp, fp->fPatIdx, status);
4139                 fp->fPatIdx++;
4140             }
4141             break;
4142 
4143 
4144         case URX_LOOP_DOT_I:
4145             // Loop Initialization for the optimized implementation of .*
4146             //   This op scans through all remaining input.
4147             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4148             {
4149                 // Loop through input until the input is exhausted (we reach an end-of-line)
4150                 // In DOTALL mode, we can just go straight to the end of the input.
4151                 int64_t ix;
4152                 if ((opValue & 1) == 1) {
4153                     // Dot-matches-All mode.  Jump straight to the end of the string.
4154                     ix = fActiveLimit;
4155                     fHitEnd = TRUE;
4156                 } else {
4157                     // NOT DOT ALL mode.  Line endings do not match '.'
4158                     // Scan forward until a line ending or end of input.
4159                     ix = fp->fInputIdx;
4160                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4161                     for (;;) {
4162                         if (ix >= fActiveLimit) {
4163                             fHitEnd = TRUE;
4164                             break;
4165                         }
4166                         UChar32 c = UTEXT_NEXT32(fInputText);
4167                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4168                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4169                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4170                                     isLineTerminator(c))) {
4171                                 //  char is a line ending.  Exit the scanning loop.
4172                                 break;
4173                             }
4174                         }
4175                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4176                     }
4177                 }
4178 
4179                 // If there were no matching characters, skip over the loop altogether.
4180                 //   The loop doesn't run at all, a * op always succeeds.
4181                 if (ix == fp->fInputIdx) {
4182                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4183                     break;
4184                 }
4185 
4186                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4187                 //   must follow.  It's operand is the stack location
4188                 //   that holds the starting input index for the match of this .*
4189                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4190                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4191                 int32_t stackLoc = URX_VAL(loopcOp);
4192                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4193                 fp->fExtra[stackLoc] = fp->fInputIdx;
4194                 fp->fInputIdx = ix;
4195 
4196                 // Save State to the URX_LOOP_C op that follows this one,
4197                 //   so that match failures in the following code will return to there.
4198                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4199                 fp = StateSave(fp, fp->fPatIdx, status);
4200                 fp->fPatIdx++;
4201             }
4202             break;
4203 
4204 
4205         case URX_LOOP_C:
4206             {
4207                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4208                 backSearchIndex = fp->fExtra[opValue];
4209                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4210                 if (backSearchIndex == fp->fInputIdx) {
4211                     // We've backed up the input idx to the point that the loop started.
4212                     // The loop is done.  Leave here without saving state.
4213                     //  Subsequent failures won't come back here.
4214                     break;
4215                 }
4216                 // Set up for the next iteration of the loop, with input index
4217                 //   backed up by one from the last time through,
4218                 //   and a state save to this instruction in case the following code fails again.
4219                 //   (We're going backwards because this loop emulates stack unwinding, not
4220                 //    the initial scan forward.)
4221                 U_ASSERT(fp->fInputIdx > 0);
4222                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4223                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4224                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4225 
4226                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4227                 if (prevC == 0x0a &&
4228                     fp->fInputIdx > backSearchIndex &&
4229                     twoPrevC == 0x0d) {
4230                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4231                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4232                         // .*, stepping back over CRLF pair.
4233                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4234                     }
4235                 }
4236 
4237 
4238                 fp = StateSave(fp, fp->fPatIdx-1, status);
4239             }
4240             break;
4241 
4242 
4243 
4244         default:
4245             // Trouble.  The compiled pattern contains an entry with an
4246             //           unrecognized type tag.
4247             UPRV_UNREACHABLE_ASSERT;
4248             // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4249             // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4250             // See ICU-21669.
4251             status = U_INTERNAL_PROGRAM_ERROR;
4252         }
4253 
4254         if (U_FAILURE(status)) {
4255             isMatch = FALSE;
4256             break;
4257         }
4258     }
4259 
4260 breakFromLoop:
4261     fMatch = isMatch;
4262     if (isMatch) {
4263         fLastMatchEnd = fMatchEnd;
4264         fMatchStart   = startIdx;
4265         fMatchEnd     = fp->fInputIdx;
4266     }
4267 
4268 #ifdef REGEX_RUN_DEBUG
4269     if (fTraceDebug) {
4270         if (isMatch) {
4271             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4272         } else {
4273             printf("No match\n\n");
4274         }
4275     }
4276 #endif
4277 
4278     fFrame = fp;                // The active stack frame when the engine stopped.
4279                                 //   Contains the capture group results that we need to
4280                                 //    access later.
4281     return;
4282 }
4283 
4284 
4285 //--------------------------------------------------------------------------------
4286 //
4287 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4288 //                  assumption that the entire string is available in the UText's
4289 //                  chunk buffer. For now, that means we can use int32_t indexes,
4290 //                  except for anything that needs to be saved (like group starts
4291 //                  and ends).
4292 //
4293 //                  startIdx:    begin matching a this index.
4294 //                  toEnd:       if true, match must extend to end of the input region
4295 //
4296 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4297 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4298     UBool       isMatch  = FALSE;      // True if the we have a match.
4299 
4300     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4301 
4302     int32_t     op;                    // Operation from the compiled pattern, split into
4303     int32_t     opType;                //    the opcode
4304     int32_t     opValue;               //    and the operand value.
4305 
4306 #ifdef REGEX_RUN_DEBUG
4307     if (fTraceDebug) {
4308         printf("MatchAt(startIdx=%d)\n", startIdx);
4309         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4310         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4311     }
4312 #endif
4313 
4314     if (U_FAILURE(status)) {
4315         return;
4316     }
4317 
4318     //  Cache frequently referenced items from the compiled pattern
4319     //
4320     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4321 
4322     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4323     UVector             *fSets         = fPattern->fSets;
4324 
4325     const UChar         *inputBuf      = fInputText->chunkContents;
4326 
4327     fFrameSize = fPattern->fFrameSize;
4328     REStackFrame        *fp            = resetStack();
4329     if (U_FAILURE(fDeferredStatus)) {
4330         status = fDeferredStatus;
4331         return;
4332     }
4333 
4334     fp->fPatIdx   = 0;
4335     fp->fInputIdx = startIdx;
4336 
4337     // Zero out the pattern's static data
4338     int32_t i;
4339     for (i = 0; i<fPattern->fDataSize; i++) {
4340         fData[i] = 0;
4341     }
4342 
4343     //
4344     //  Main loop for interpreting the compiled pattern.
4345     //  One iteration of the loop per pattern operation performed.
4346     //
4347     for (;;) {
4348         op      = (int32_t)pat[fp->fPatIdx];
4349         opType  = URX_TYPE(op);
4350         opValue = URX_VAL(op);
4351 #ifdef REGEX_RUN_DEBUG
4352         if (fTraceDebug) {
4353             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4354             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4355                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4356             fPattern->dumpOp(fp->fPatIdx);
4357         }
4358 #endif
4359         fp->fPatIdx++;
4360 
4361         switch (opType) {
4362 
4363 
4364         case URX_NOP:
4365             break;
4366 
4367 
4368         case URX_BACKTRACK:
4369             // Force a backtrack.  In some circumstances, the pattern compiler
4370             //   will notice that the pattern can't possibly match anything, and will
4371             //   emit one of these at that point.
4372             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373             break;
4374 
4375 
4376         case URX_ONECHAR:
4377             if (fp->fInputIdx < fActiveLimit) {
4378                 UChar32 c;
4379                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4380                 if (c == opValue) {
4381                     break;
4382                 }
4383             } else {
4384                 fHitEnd = TRUE;
4385             }
4386             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4387             break;
4388 
4389 
4390         case URX_STRING:
4391             {
4392                 // Test input against a literal string.
4393                 // Strings require two slots in the compiled pattern, one for the
4394                 //   offset to the string text, and one for the length.
4395                 int32_t   stringStartIdx = opValue;
4396                 int32_t   stringLen;
4397 
4398                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4399                 fp->fPatIdx++;
4400                 opType    = URX_TYPE(op);
4401                 stringLen = URX_VAL(op);
4402                 U_ASSERT(opType == URX_STRING_LEN);
4403                 U_ASSERT(stringLen >= 2);
4404 
4405                 const UChar * pInp = inputBuf + fp->fInputIdx;
4406                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4407                 const UChar * pPat = litText+stringStartIdx;
4408                 const UChar * pEnd = pInp + stringLen;
4409                 UBool success = TRUE;
4410                 while (pInp < pEnd) {
4411                     if (pInp >= pInpLimit) {
4412                         fHitEnd = TRUE;
4413                         success = FALSE;
4414                         break;
4415                     }
4416                     if (*pInp++ != *pPat++) {
4417                         success = FALSE;
4418                         break;
4419                     }
4420                 }
4421 
4422                 if (success) {
4423                     fp->fInputIdx += stringLen;
4424                 } else {
4425                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426                 }
4427             }
4428             break;
4429 
4430 
4431         case URX_STATE_SAVE:
4432             fp = StateSave(fp, opValue, status);
4433             break;
4434 
4435 
4436         case URX_END:
4437             // The match loop will exit via this path on a successful match,
4438             //   when we reach the end of the pattern.
4439             if (toEnd && fp->fInputIdx != fActiveLimit) {
4440                 // The pattern matched, but not to the end of input.  Try some more.
4441                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4442                 break;
4443             }
4444             isMatch = TRUE;
4445             goto  breakFromLoop;
4446 
4447             // Start and End Capture stack frame variables are laid out out like this:
4448             //  fp->fExtra[opValue]  - The start of a completed capture group
4449             //             opValue+1 - The end   of a completed capture group
4450             //             opValue+2 - the start of a capture group whose end
4451             //                          has not yet been reached (and might not ever be).
4452         case URX_START_CAPTURE:
4453             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4454             fp->fExtra[opValue+2] = fp->fInputIdx;
4455             break;
4456 
4457 
4458         case URX_END_CAPTURE:
4459             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4460             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4461             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4462             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4463             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4464             break;
4465 
4466 
4467         case URX_DOLLAR:                   //  $, test for End of line
4468             //     or for position before new line at end of input
4469             if (fp->fInputIdx < fAnchorLimit-2) {
4470                 // We are no where near the end of input.  Fail.
4471                 //   This is the common case.  Keep it first.
4472                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4473                 break;
4474             }
4475             if (fp->fInputIdx >= fAnchorLimit) {
4476                 // We really are at the end of input.  Success.
4477                 fHitEnd = TRUE;
4478                 fRequireEnd = TRUE;
4479                 break;
4480             }
4481 
4482             // If we are positioned just before a new-line that is located at the
4483             //   end of input, succeed.
4484             if (fp->fInputIdx == fAnchorLimit-1) {
4485                 UChar32 c;
4486                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4487 
4488                 if (isLineTerminator(c)) {
4489                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4490                         // At new-line at end of input. Success
4491                         fHitEnd = TRUE;
4492                         fRequireEnd = TRUE;
4493                         break;
4494                     }
4495                 }
4496             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4497                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4498                     fHitEnd = TRUE;
4499                     fRequireEnd = TRUE;
4500                     break;                         // At CR/LF at end of input.  Success
4501             }
4502 
4503             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504 
4505             break;
4506 
4507 
4508         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4509             if (fp->fInputIdx >= fAnchorLimit-1) {
4510                 // Either at the last character of input, or off the end.
4511                 if (fp->fInputIdx == fAnchorLimit-1) {
4512                     // At last char of input.  Success if it's a new line.
4513                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4514                         fHitEnd = TRUE;
4515                         fRequireEnd = TRUE;
4516                         break;
4517                     }
4518                 } else {
4519                     // Off the end of input.  Success.
4520                     fHitEnd = TRUE;
4521                     fRequireEnd = TRUE;
4522                     break;
4523                 }
4524             }
4525 
4526             // Not at end of input.  Back-track out.
4527             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4528             break;
4529 
4530 
4531         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4532             {
4533                 if (fp->fInputIdx >= fAnchorLimit) {
4534                     // We really are at the end of input.  Success.
4535                     fHitEnd = TRUE;
4536                     fRequireEnd = TRUE;
4537                     break;
4538                 }
4539                 // If we are positioned just before a new-line, succeed.
4540                 // It makes no difference where the new-line is within the input.
4541                 UChar32 c = inputBuf[fp->fInputIdx];
4542                 if (isLineTerminator(c)) {
4543                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4544                     //  In multi-line mode, hitting a new-line just before the end of input does not
4545                     //   set the hitEnd or requireEnd flags
4546                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4547                         break;
4548                     }
4549                 }
4550                 // not at a new line.  Fail.
4551                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552             }
4553             break;
4554 
4555 
4556         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4557             {
4558                 if (fp->fInputIdx >= fAnchorLimit) {
4559                     // We really are at the end of input.  Success.
4560                     fHitEnd = TRUE;
4561                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4562                     break;               //   adding a new-line would not lose the match.
4563                 }
4564                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4565                 // It makes no difference where the new-line is within the input.
4566                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4567                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4568                 }
4569             }
4570             break;
4571 
4572 
4573         case URX_CARET:                    //  ^, test for start of line
4574             if (fp->fInputIdx != fAnchorStart) {
4575                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4576             }
4577             break;
4578 
4579 
4580         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4581             {
4582                 if (fp->fInputIdx == fAnchorStart) {
4583                     // We are at the start input.  Success.
4584                     break;
4585                 }
4586                 // Check whether character just before the current pos is a new-line
4587                 //   unless we are at the end of input
4588                 UChar  c = inputBuf[fp->fInputIdx - 1];
4589                 if ((fp->fInputIdx < fAnchorLimit) &&
4590                     isLineTerminator(c)) {
4591                     //  It's a new-line.  ^ is true.  Success.
4592                     //  TODO:  what should be done with positions between a CR and LF?
4593                     break;
4594                 }
4595                 // Not at the start of a line.  Fail.
4596                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4597             }
4598             break;
4599 
4600 
4601         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4602             {
4603                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4604                 if (fp->fInputIdx <= fAnchorStart) {
4605                     // We are at the start input.  Success.
4606                     break;
4607                 }
4608                 // Check whether character just before the current pos is a new-line
4609                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4610                 UChar  c = inputBuf[fp->fInputIdx - 1];
4611                 if (c != 0x0a) {
4612                     // Not at the start of a line.  Back-track out.
4613                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4614                 }
4615             }
4616             break;
4617 
4618         case URX_BACKSLASH_B:          // Test for word boundaries
4619             {
4620                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4621                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4622                 if (!success) {
4623                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4624                 }
4625             }
4626             break;
4627 
4628 
4629         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4630             {
4631                 UBool success = isUWordBoundary(fp->fInputIdx, status);
4632                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4633                 if (!success) {
4634                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4635                 }
4636             }
4637             break;
4638 
4639 
4640         case URX_BACKSLASH_D:            // Test for decimal digit
4641             {
4642                 if (fp->fInputIdx >= fActiveLimit) {
4643                     fHitEnd = TRUE;
4644                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4645                     break;
4646                 }
4647 
4648                 UChar32 c;
4649                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4650                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4651                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4652                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4653                 if (!success) {
4654                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4655                 }
4656             }
4657             break;
4658 
4659 
4660         case URX_BACKSLASH_G:          // Test for position at end of previous match
4661             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4662                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663             }
4664             break;
4665 
4666 
4667         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4668             {
4669                 if (fp->fInputIdx >= fActiveLimit) {
4670                     fHitEnd = TRUE;
4671                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4672                     break;
4673                 }
4674                 UChar32 c;
4675                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4676                 int8_t ctype = u_charType(c);
4677                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4678                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4679                 if (!success) {
4680                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681                 }
4682             }
4683             break;
4684 
4685 
4686         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4687             {
4688                 if (fp->fInputIdx >= fActiveLimit) {
4689                     fHitEnd = TRUE;
4690                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691                     break;
4692                 }
4693                 UChar32 c;
4694                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4695                 if (isLineTerminator(c)) {
4696                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4697                         // Check for CR/LF sequence. Consume both together when found.
4698                         UChar c2;
4699                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4700                         if (c2 != 0x0a) {
4701                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4702                         }
4703                     }
4704                 } else {
4705                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706                 }
4707             }
4708             break;
4709 
4710 
4711         case URX_BACKSLASH_V:         // Any single code point line ending.
4712             {
4713                 if (fp->fInputIdx >= fActiveLimit) {
4714                     fHitEnd = TRUE;
4715                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716                     break;
4717                 }
4718                 UChar32 c;
4719                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720                 UBool success = isLineTerminator(c);
4721                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4722                 if (!success) {
4723                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724                 }
4725             }
4726             break;
4727 
4728 
4729         case URX_BACKSLASH_X:
4730             //  Match a Grapheme, as defined by Unicode UAX 29.
4731 
4732             // Fail if at end of input
4733             if (fp->fInputIdx >= fActiveLimit) {
4734                 fHitEnd = TRUE;
4735                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736                 break;
4737             }
4738 
4739             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4740             if (fp->fInputIdx >= fActiveLimit) {
4741                 fHitEnd = TRUE;
4742                 fp->fInputIdx = fActiveLimit;
4743             }
4744             break;
4745 
4746 
4747         case URX_BACKSLASH_Z:          // Test for end of Input
4748             if (fp->fInputIdx < fAnchorLimit) {
4749                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750             } else {
4751                 fHitEnd = TRUE;
4752                 fRequireEnd = TRUE;
4753             }
4754             break;
4755 
4756 
4757 
4758         case URX_STATIC_SETREF:
4759             {
4760                 // Test input character against one of the predefined sets
4761                 //    (Word Characters, for example)
4762                 // The high bit of the op value is a flag for the match polarity.
4763                 //    0:   success if input char is in set.
4764                 //    1:   success if input char is not in set.
4765                 if (fp->fInputIdx >= fActiveLimit) {
4766                     fHitEnd = TRUE;
4767                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4768                     break;
4769                 }
4770 
4771                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4772                 opValue &= ~URX_NEG_SET;
4773                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4774 
4775                 UChar32 c;
4776                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4777                 if (c < 256) {
4778                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4779                     if (s8.contains(c)) {
4780                         success = !success;
4781                     }
4782                 } else {
4783                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4784                     if (s.contains(c)) {
4785                         success = !success;
4786                     }
4787                 }
4788                 if (!success) {
4789                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4790                 }
4791             }
4792             break;
4793 
4794 
4795         case URX_STAT_SETREF_N:
4796             {
4797                 // Test input character for NOT being a member of  one of
4798                 //    the predefined sets (Word Characters, for example)
4799                 if (fp->fInputIdx >= fActiveLimit) {
4800                     fHitEnd = TRUE;
4801                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4802                     break;
4803                 }
4804 
4805                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4806 
4807                 UChar32  c;
4808                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809                 if (c < 256) {
4810                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4811                     if (s8.contains(c) == FALSE) {
4812                         break;
4813                     }
4814                 } else {
4815                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4816                     if (s.contains(c) == FALSE) {
4817                         break;
4818                     }
4819                 }
4820                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4821             }
4822             break;
4823 
4824 
4825         case URX_SETREF:
4826             {
4827                 if (fp->fInputIdx >= fActiveLimit) {
4828                     fHitEnd = TRUE;
4829                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830                     break;
4831                 }
4832 
4833                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4834 
4835                 // There is input left.  Pick up one char and test it for set membership.
4836                 UChar32  c;
4837                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4838                 if (c<256) {
4839                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4840                     if (s8->contains(c)) {
4841                         // The character is in the set.  A Match.
4842                         break;
4843                     }
4844                 } else {
4845                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4846                     if (s->contains(c)) {
4847                         // The character is in the set.  A Match.
4848                         break;
4849                     }
4850                 }
4851 
4852                 // the character wasn't in the set.
4853                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854             }
4855             break;
4856 
4857 
4858         case URX_DOTANY:
4859             {
4860                 // . matches anything, but stops at end-of-line.
4861                 if (fp->fInputIdx >= fActiveLimit) {
4862                     // At end of input.  Match failed.  Backtrack out.
4863                     fHitEnd = TRUE;
4864                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865                     break;
4866                 }
4867 
4868                 // There is input left.  Advance over one char, unless we've hit end-of-line
4869                 UChar32  c;
4870                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4871                 if (isLineTerminator(c)) {
4872                     // End of line in normal mode.   . does not match.
4873                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4874                     break;
4875                 }
4876             }
4877             break;
4878 
4879 
4880         case URX_DOTANY_ALL:
4881             {
4882                 // . in dot-matches-all (including new lines) mode
4883                 if (fp->fInputIdx >= fActiveLimit) {
4884                     // At end of input.  Match failed.  Backtrack out.
4885                     fHitEnd = TRUE;
4886                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4887                     break;
4888                 }
4889 
4890                 // There is input left.  Advance over one char, except if we are
4891                 //   at a cr/lf, advance over both of them.
4892                 UChar32 c;
4893                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4895                     // In the case of a CR/LF, we need to advance over both.
4896                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4897                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4898                     }
4899                 }
4900             }
4901             break;
4902 
4903 
4904         case URX_DOTANY_UNIX:
4905             {
4906                 // '.' operator, matches all, but stops at end-of-line.
4907                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4908                 if (fp->fInputIdx >= fActiveLimit) {
4909                     // At end of input.  Match failed.  Backtrack out.
4910                     fHitEnd = TRUE;
4911                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912                     break;
4913                 }
4914 
4915                 // There is input left.  Advance over one char, unless we've hit end-of-line
4916                 UChar32 c;
4917                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4918                 if (c == 0x0a) {
4919                     // End of line in normal mode.   '.' does not match the \n
4920                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4921                 }
4922             }
4923             break;
4924 
4925 
4926         case URX_JMP:
4927             fp->fPatIdx = opValue;
4928             break;
4929 
4930         case URX_FAIL:
4931             isMatch = FALSE;
4932             goto breakFromLoop;
4933 
4934         case URX_JMP_SAV:
4935             U_ASSERT(opValue < fPattern->fCompiledPat->size());
4936             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4937             fp->fPatIdx = opValue;                         // Then JMP.
4938             break;
4939 
4940         case URX_JMP_SAV_X:
4941             // This opcode is used with (x)+, when x can match a zero length string.
4942             // Same as JMP_SAV, except conditional on the match having made forward progress.
4943             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4944             //   data address of the input position at the start of the loop.
4945             {
4946                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4947                 int32_t  stoOp = (int32_t)pat[opValue-1];
4948                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4949                 int32_t  frameLoc = URX_VAL(stoOp);
4950                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4951                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4952                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4953                 if (prevInputIdx < fp->fInputIdx) {
4954                     // The match did make progress.  Repeat the loop.
4955                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4956                     fp->fPatIdx = opValue;
4957                     fp->fExtra[frameLoc] = fp->fInputIdx;
4958                 }
4959                 // If the input position did not advance, we do nothing here,
4960                 //   execution will fall out of the loop.
4961             }
4962             break;
4963 
4964         case URX_CTR_INIT:
4965             {
4966                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4967                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4968 
4969                 // Pick up the three extra operands that CTR_INIT has, and
4970                 //    skip the pattern location counter past
4971                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972                 fp->fPatIdx += 3;
4973                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4974                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976                 U_ASSERT(minCount>=0);
4977                 U_ASSERT(maxCount>=minCount || maxCount==-1);
4978                 U_ASSERT(loopLoc>=fp->fPatIdx);
4979 
4980                 if (minCount == 0) {
4981                     fp = StateSave(fp, loopLoc+1, status);
4982                 }
4983                 if (maxCount == -1) {
4984                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4985                 } else if (maxCount == 0) {
4986                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4987                 }
4988             }
4989             break;
4990 
4991         case URX_CTR_LOOP:
4992             {
4993                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4994                 int32_t initOp = (int32_t)pat[opValue];
4995                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4996                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4997                 int32_t minCount  = (int32_t)pat[opValue+2];
4998                 int32_t maxCount  = (int32_t)pat[opValue+3];
4999                 (*pCounter)++;
5000                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5001                     U_ASSERT(*pCounter == maxCount);
5002                     break;
5003                 }
5004                 if (*pCounter >= minCount) {
5005                     if (maxCount == -1) {
5006                         // Loop has no hard upper bound.
5007                         // Check that it is progressing through the input, break if it is not.
5008                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5009                         if (fp->fInputIdx == *pLastInputIdx) {
5010                             break;
5011                         } else {
5012                             *pLastInputIdx = fp->fInputIdx;
5013                         }
5014                     }
5015                     fp = StateSave(fp, fp->fPatIdx, status);
5016                 } else {
5017                     // Increment time-out counter. (StateSave() does it if count >= minCount)
5018                     fTickCounter--;
5019                     if (fTickCounter <= 0) {
5020                         IncrementTime(status);    // Re-initializes fTickCounter
5021                     }
5022                 }
5023                 fp->fPatIdx = opValue + 4;    // Loop back.
5024             }
5025             break;
5026 
5027         case URX_CTR_INIT_NG:
5028             {
5029                 // Initialize a non-greedy loop
5030                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5031                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5032 
5033                 // Pick up the three extra operands that CTR_INIT_NG has, and
5034                 //    skip the pattern location counter past
5035                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5036                 fp->fPatIdx += 3;
5037                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5038                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5039                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5040                 U_ASSERT(minCount>=0);
5041                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5042                 U_ASSERT(loopLoc>fp->fPatIdx);
5043                 if (maxCount == -1) {
5044                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5045                 }
5046 
5047                 if (minCount == 0) {
5048                     if (maxCount != 0) {
5049                         fp = StateSave(fp, fp->fPatIdx, status);
5050                     }
5051                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5052                 }
5053             }
5054             break;
5055 
5056         case URX_CTR_LOOP_NG:
5057             {
5058                 // Non-greedy {min, max} loops
5059                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5060                 int32_t initOp = (int32_t)pat[opValue];
5061                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5062                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5063                 int32_t minCount  = (int32_t)pat[opValue+2];
5064                 int32_t maxCount  = (int32_t)pat[opValue+3];
5065 
5066                 (*pCounter)++;
5067                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5068                     // The loop has matched the maximum permitted number of times.
5069                     //   Break out of here with no action.  Matching will
5070                     //   continue with the following pattern.
5071                     U_ASSERT(*pCounter == maxCount);
5072                     break;
5073                 }
5074 
5075                 if (*pCounter < minCount) {
5076                     // We haven't met the minimum number of matches yet.
5077                     //   Loop back for another one.
5078                     fp->fPatIdx = opValue + 4;    // Loop back.
5079                     fTickCounter--;
5080                     if (fTickCounter <= 0) {
5081                         IncrementTime(status);    // Re-initializes fTickCounter
5082                     }
5083                 } else {
5084                     // We do have the minimum number of matches.
5085 
5086                     // If there is no upper bound on the loop iterations, check that the input index
5087                     // is progressing, and stop the loop if it is not.
5088                     if (maxCount == -1) {
5089                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5090                         if (fp->fInputIdx == *pLastInputIdx) {
5091                             break;
5092                         }
5093                         *pLastInputIdx = fp->fInputIdx;
5094                     }
5095 
5096                     // Loop Continuation: we will fall into the pattern following the loop
5097                     //   (non-greedy, don't execute loop body first), but first do
5098                     //   a state save to the top of the loop, so that a match failure
5099                     //   in the following pattern will try another iteration of the loop.
5100                     fp = StateSave(fp, opValue + 4, status);
5101                 }
5102             }
5103             break;
5104 
5105         case URX_STO_SP:
5106             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5107             fData[opValue] = fStack->size();
5108             break;
5109 
5110         case URX_LD_SP:
5111             {
5112                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5113                 int32_t newStackSize = (int32_t)fData[opValue];
5114                 U_ASSERT(newStackSize <= fStack->size());
5115                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5116                 if (newFP == (int64_t *)fp) {
5117                     break;
5118                 }
5119                 int32_t j;
5120                 for (j=0; j<fFrameSize; j++) {
5121                     newFP[j] = ((int64_t *)fp)[j];
5122                 }
5123                 fp = (REStackFrame *)newFP;
5124                 fStack->setSize(newStackSize);
5125             }
5126             break;
5127 
5128         case URX_BACKREF:
5129             {
5130                 U_ASSERT(opValue < fFrameSize);
5131                 int64_t groupStartIdx = fp->fExtra[opValue];
5132                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5133                 U_ASSERT(groupStartIdx <= groupEndIdx);
5134                 int64_t inputIndex = fp->fInputIdx;
5135                 if (groupStartIdx < 0) {
5136                     // This capture group has not participated in the match thus far,
5137                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5138                     break;
5139                 }
5140                 UBool success = TRUE;
5141                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5142                     if (inputIndex >= fActiveLimit) {
5143                         success = FALSE;
5144                         fHitEnd = TRUE;
5145                         break;
5146                     }
5147                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5148                         success = FALSE;
5149                         break;
5150                     }
5151                 }
5152                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5153                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5154                     // Capture group ended with an unpaired lead surrogate.
5155                     // Back reference is not permitted to match lead only of a surrogatge pair.
5156                     success = FALSE;
5157                 }
5158                 if (success) {
5159                     fp->fInputIdx = inputIndex;
5160                 } else {
5161                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5162                 }
5163             }
5164             break;
5165 
5166         case URX_BACKREF_I:
5167             {
5168                 U_ASSERT(opValue < fFrameSize);
5169                 int64_t groupStartIdx = fp->fExtra[opValue];
5170                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5171                 U_ASSERT(groupStartIdx <= groupEndIdx);
5172                 if (groupStartIdx < 0) {
5173                     // This capture group has not participated in the match thus far,
5174                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5175                     break;
5176                 }
5177                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5178                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5179 
5180                 //   Note: if the capture group match was of an empty string the backref
5181                 //         match succeeds.  Verified by testing:  Perl matches succeed
5182                 //         in this case, so we do too.
5183 
5184                 UBool success = TRUE;
5185                 for (;;) {
5186                     UChar32 captureGroupChar = captureGroupItr.next();
5187                     if (captureGroupChar == U_SENTINEL) {
5188                         success = TRUE;
5189                         break;
5190                     }
5191                     UChar32 inputChar = inputItr.next();
5192                     if (inputChar == U_SENTINEL) {
5193                         success = FALSE;
5194                         fHitEnd = TRUE;
5195                         break;
5196                     }
5197                     if (inputChar != captureGroupChar) {
5198                         success = FALSE;
5199                         break;
5200                     }
5201                 }
5202 
5203                 if (success && inputItr.inExpansion()) {
5204                     // We obtained a match by consuming part of a string obtained from
5205                     // case-folding a single code point of the input text.
5206                     // This does not count as an overall match.
5207                     success = FALSE;
5208                 }
5209 
5210                 if (success) {
5211                     fp->fInputIdx = inputItr.getIndex();
5212                 } else {
5213                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5214                 }
5215             }
5216             break;
5217 
5218         case URX_STO_INP_LOC:
5219             {
5220                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5221                 fp->fExtra[opValue] = fp->fInputIdx;
5222             }
5223             break;
5224 
5225         case URX_JMPX:
5226             {
5227                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5228                 fp->fPatIdx += 1;
5229                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5230                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5231                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5232                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5233                 if (savedInputIdx < fp->fInputIdx) {
5234                     fp->fPatIdx = opValue;                               // JMP
5235                 } else {
5236                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5237                 }
5238             }
5239             break;
5240 
5241         case URX_LA_START:
5242             {
5243                 // Entering a look around block.
5244                 // Save Stack Ptr, Input Pos.
5245                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5246                 fData[opValue]   = fStack->size();
5247                 fData[opValue+1] = fp->fInputIdx;
5248                 fData[opValue+2] = fActiveStart;
5249                 fData[opValue+3] = fActiveLimit;
5250                 fActiveStart     = fLookStart;          // Set the match region change for
5251                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5252             }
5253             break;
5254 
5255         case URX_LA_END:
5256             {
5257                 // Leaving a look around block.
5258                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5259                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5260                 int32_t stackSize = fStack->size();
5261                 int32_t newStackSize = (int32_t)fData[opValue];
5262                 U_ASSERT(stackSize >= newStackSize);
5263                 if (stackSize > newStackSize) {
5264                     // Copy the current top frame back to the new (cut back) top frame.
5265                     //   This makes the capture groups from within the look-ahead
5266                     //   expression available.
5267                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5268                     int32_t j;
5269                     for (j=0; j<fFrameSize; j++) {
5270                         newFP[j] = ((int64_t *)fp)[j];
5271                     }
5272                     fp = (REStackFrame *)newFP;
5273                     fStack->setSize(newStackSize);
5274                 }
5275                 fp->fInputIdx = fData[opValue+1];
5276 
5277                 // Restore the active region bounds in the input string; they may have
5278                 //    been changed because of transparent bounds on a Region.
5279                 fActiveStart = fData[opValue+2];
5280                 fActiveLimit = fData[opValue+3];
5281                 U_ASSERT(fActiveStart >= 0);
5282                 U_ASSERT(fActiveLimit <= fInputLength);
5283             }
5284             break;
5285 
5286         case URX_ONECHAR_I:
5287             if (fp->fInputIdx < fActiveLimit) {
5288                 UChar32 c;
5289                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5290                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5291                     break;
5292                 }
5293             } else {
5294                 fHitEnd = TRUE;
5295             }
5296             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5297             break;
5298 
5299         case URX_STRING_I:
5300             // Case-insensitive test input against a literal string.
5301             // Strings require two slots in the compiled pattern, one for the
5302             //   offset to the string text, and one for the length.
5303             //   The compiled string has already been case folded.
5304             {
5305                 const UChar *patternString = litText + opValue;
5306 
5307                 op      = (int32_t)pat[fp->fPatIdx];
5308                 fp->fPatIdx++;
5309                 opType  = URX_TYPE(op);
5310                 opValue = URX_VAL(op);
5311                 U_ASSERT(opType == URX_STRING_LEN);
5312                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5313 
5314                 UChar32      cText;
5315                 UChar32      cPattern;
5316                 UBool        success = TRUE;
5317                 int32_t      patternStringIdx  = 0;
5318                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5319                 while (patternStringIdx < patternStringLen) {
5320                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5321                     cText = inputIterator.next();
5322                     if (cText != cPattern) {
5323                         success = FALSE;
5324                         if (cText == U_SENTINEL) {
5325                             fHitEnd = TRUE;
5326                         }
5327                         break;
5328                     }
5329                 }
5330                 if (inputIterator.inExpansion()) {
5331                     success = FALSE;
5332                 }
5333 
5334                 if (success) {
5335                     fp->fInputIdx = inputIterator.getIndex();
5336                 } else {
5337                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5338                 }
5339             }
5340             break;
5341 
5342         case URX_LB_START:
5343             {
5344                 // Entering a look-behind block.
5345                 // Save Stack Ptr, Input Pos and active input region.
5346                 //   TODO:  implement transparent bounds.  Ticket #6067
5347                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5348                 fData[opValue]   = fStack->size();
5349                 fData[opValue+1] = fp->fInputIdx;
5350                 // Save input string length, then reset to pin any matches to end at
5351                 //   the current position.
5352                 fData[opValue+2] = fActiveStart;
5353                 fData[opValue+3] = fActiveLimit;
5354                 fActiveStart     = fRegionStart;
5355                 fActiveLimit     = fp->fInputIdx;
5356                 // Init the variable containing the start index for attempted matches.
5357                 fData[opValue+4] = -1;
5358             }
5359             break;
5360 
5361 
5362         case URX_LB_CONT:
5363             {
5364                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5365                 //    at all possible input starting positions.
5366 
5367                 // Fetch the min and max possible match lengths.  They are the operands
5368                 //   of this op in the pattern.
5369                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5370                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5371                 U_ASSERT(minML <= maxML);
5372                 U_ASSERT(minML >= 0);
5373 
5374                 // Fetch (from data) the last input index where a match was attempted.
5375                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5376                 int64_t  &lbStartIdx = fData[opValue+4];
5377                 if (lbStartIdx < 0) {
5378                     // First time through loop.
5379                     lbStartIdx = fp->fInputIdx - minML;
5380                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5381                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5382                     }
5383                 } else {
5384                     // 2nd through nth time through the loop.
5385                     // Back up start position for match by one.
5386                     if (lbStartIdx == 0) {
5387                         lbStartIdx--;
5388                     } else {
5389                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5390                     }
5391                 }
5392 
5393                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5394                     // We have tried all potential match starting points without
5395                     //  getting a match.  Backtrack out, and out of the
5396                     //   Look Behind altogether.
5397                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5398                     fActiveStart = fData[opValue+2];
5399                     fActiveLimit = fData[opValue+3];
5400                     U_ASSERT(fActiveStart >= 0);
5401                     U_ASSERT(fActiveLimit <= fInputLength);
5402                     break;
5403                 }
5404 
5405                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5406                 //      (successful match will fall off the end of the loop.)
5407                 fp = StateSave(fp, fp->fPatIdx-3, status);
5408                 fp->fInputIdx =  lbStartIdx;
5409             }
5410             break;
5411 
5412         case URX_LB_END:
5413             // End of a look-behind block, after a successful match.
5414             {
5415                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5416                 if (fp->fInputIdx != fActiveLimit) {
5417                     //  The look-behind expression matched, but the match did not
5418                     //    extend all the way to the point that we are looking behind from.
5419                     //  FAIL out of here, which will take us back to the LB_CONT, which
5420                     //     will retry the match starting at another position or fail
5421                     //     the look-behind altogether, whichever is appropriate.
5422                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5423                     break;
5424                 }
5425 
5426                 // Look-behind match is good.  Restore the original input string region,
5427                 //   which had been truncated to pin the end of the lookbehind match to the
5428                 //   position being looked-behind.
5429                 fActiveStart = fData[opValue+2];
5430                 fActiveLimit = fData[opValue+3];
5431                 U_ASSERT(fActiveStart >= 0);
5432                 U_ASSERT(fActiveLimit <= fInputLength);
5433             }
5434             break;
5435 
5436 
5437         case URX_LBN_CONT:
5438             {
5439                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5440                 //    at all possible input starting positions.
5441 
5442                 // Fetch the extra parameters of this op.
5443                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5444                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5445                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5446                 continueLoc = URX_VAL(continueLoc);
5447                 U_ASSERT(minML <= maxML);
5448                 U_ASSERT(minML >= 0);
5449                 U_ASSERT(continueLoc > fp->fPatIdx);
5450 
5451                 // Fetch (from data) the last input index where a match was attempted.
5452                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5453                 int64_t  &lbStartIdx = fData[opValue+4];
5454                 if (lbStartIdx < 0) {
5455                     // First time through loop.
5456                     lbStartIdx = fp->fInputIdx - minML;
5457                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5458                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5459                     }
5460                 } else {
5461                     // 2nd through nth time through the loop.
5462                     // Back up start position for match by one.
5463                     if (lbStartIdx == 0) {
5464                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5465                     } else {
5466                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5467                     }
5468                 }
5469 
5470                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5471                     // We have tried all potential match starting points without
5472                     //  getting a match, which means that the negative lookbehind as
5473                     //  a whole has succeeded.  Jump forward to the continue location
5474                     fActiveStart = fData[opValue+2];
5475                     fActiveLimit = fData[opValue+3];
5476                     U_ASSERT(fActiveStart >= 0);
5477                     U_ASSERT(fActiveLimit <= fInputLength);
5478                     fp->fPatIdx = continueLoc;
5479                     break;
5480                 }
5481 
5482                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5483                 //      (successful match will cause a FAIL out of the loop altogether.)
5484                 fp = StateSave(fp, fp->fPatIdx-4, status);
5485                 fp->fInputIdx =  lbStartIdx;
5486             }
5487             break;
5488 
5489         case URX_LBN_END:
5490             // End of a negative look-behind block, after a successful match.
5491             {
5492                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5493                 if (fp->fInputIdx != fActiveLimit) {
5494                     //  The look-behind expression matched, but the match did not
5495                     //    extend all the way to the point that we are looking behind from.
5496                     //  FAIL out of here, which will take us back to the LB_CONT, which
5497                     //     will retry the match starting at another position or succeed
5498                     //     the look-behind altogether, whichever is appropriate.
5499                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500                     break;
5501                 }
5502 
5503                 // Look-behind expression matched, which means look-behind test as
5504                 //   a whole Fails
5505 
5506                 //   Restore the original input string length, which had been truncated
5507                 //   inorder to pin the end of the lookbehind match
5508                 //   to the position being looked-behind.
5509                 fActiveStart = fData[opValue+2];
5510                 fActiveLimit = fData[opValue+3];
5511                 U_ASSERT(fActiveStart >= 0);
5512                 U_ASSERT(fActiveLimit <= fInputLength);
5513 
5514                 // Restore original stack position, discarding any state saved
5515                 //   by the successful pattern match.
5516                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5517                 int32_t newStackSize = (int32_t)fData[opValue];
5518                 U_ASSERT(fStack->size() > newStackSize);
5519                 fStack->setSize(newStackSize);
5520 
5521                 //  FAIL, which will take control back to someplace
5522                 //  prior to entering the look-behind test.
5523                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5524             }
5525             break;
5526 
5527 
5528         case URX_LOOP_SR_I:
5529             // Loop Initialization for the optimized implementation of
5530             //     [some character set]*
5531             //   This op scans through all matching input.
5532             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5533             {
5534                 U_ASSERT(opValue > 0 && opValue < fSets->size());
5535                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5536                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5537 
5538                 // Loop through input, until either the input is exhausted or
5539                 //   we reach a character that is not a member of the set.
5540                 int32_t ix = (int32_t)fp->fInputIdx;
5541                 for (;;) {
5542                     if (ix >= fActiveLimit) {
5543                         fHitEnd = TRUE;
5544                         break;
5545                     }
5546                     UChar32   c;
5547                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5548                     if (c<256) {
5549                         if (s8->contains(c) == FALSE) {
5550                             U16_BACK_1(inputBuf, 0, ix);
5551                             break;
5552                         }
5553                     } else {
5554                         if (s->contains(c) == FALSE) {
5555                             U16_BACK_1(inputBuf, 0, ix);
5556                             break;
5557                         }
5558                     }
5559                 }
5560 
5561                 // If there were no matching characters, skip over the loop altogether.
5562                 //   The loop doesn't run at all, a * op always succeeds.
5563                 if (ix == fp->fInputIdx) {
5564                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5565                     break;
5566                 }
5567 
5568                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5569                 //   must follow.  It's operand is the stack location
5570                 //   that holds the starting input index for the match of this [set]*
5571                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5572                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5573                 int32_t stackLoc = URX_VAL(loopcOp);
5574                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5575                 fp->fExtra[stackLoc] = fp->fInputIdx;
5576                 fp->fInputIdx = ix;
5577 
5578                 // Save State to the URX_LOOP_C op that follows this one,
5579                 //   so that match failures in the following code will return to there.
5580                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5581                 fp = StateSave(fp, fp->fPatIdx, status);
5582                 fp->fPatIdx++;
5583             }
5584             break;
5585 
5586 
5587         case URX_LOOP_DOT_I:
5588             // Loop Initialization for the optimized implementation of .*
5589             //   This op scans through all remaining input.
5590             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5591             {
5592                 // Loop through input until the input is exhausted (we reach an end-of-line)
5593                 // In DOTALL mode, we can just go straight to the end of the input.
5594                 int32_t ix;
5595                 if ((opValue & 1) == 1) {
5596                     // Dot-matches-All mode.  Jump straight to the end of the string.
5597                     ix = (int32_t)fActiveLimit;
5598                     fHitEnd = TRUE;
5599                 } else {
5600                     // NOT DOT ALL mode.  Line endings do not match '.'
5601                     // Scan forward until a line ending or end of input.
5602                     ix = (int32_t)fp->fInputIdx;
5603                     for (;;) {
5604                         if (ix >= fActiveLimit) {
5605                             fHitEnd = TRUE;
5606                             break;
5607                         }
5608                         UChar32   c;
5609                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5610                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5611                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5612                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5613                                    isLineTerminator(c))) {
5614                                 //  char is a line ending.  Put the input pos back to the
5615                                 //    line ending char, and exit the scanning loop.
5616                                 U16_BACK_1(inputBuf, 0, ix);
5617                                 break;
5618                             }
5619                         }
5620                     }
5621                 }
5622 
5623                 // If there were no matching characters, skip over the loop altogether.
5624                 //   The loop doesn't run at all, a * op always succeeds.
5625                 if (ix == fp->fInputIdx) {
5626                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5627                     break;
5628                 }
5629 
5630                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5631                 //   must follow.  It's operand is the stack location
5632                 //   that holds the starting input index for the match of this .*
5633                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5634                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5635                 int32_t stackLoc = URX_VAL(loopcOp);
5636                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5637                 fp->fExtra[stackLoc] = fp->fInputIdx;
5638                 fp->fInputIdx = ix;
5639 
5640                 // Save State to the URX_LOOP_C op that follows this one,
5641                 //   so that match failures in the following code will return to there.
5642                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5643                 fp = StateSave(fp, fp->fPatIdx, status);
5644                 fp->fPatIdx++;
5645             }
5646             break;
5647 
5648 
5649         case URX_LOOP_C:
5650             {
5651                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5652                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5653                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5654                 if (backSearchIndex == fp->fInputIdx) {
5655                     // We've backed up the input idx to the point that the loop started.
5656                     // The loop is done.  Leave here without saving state.
5657                     //  Subsequent failures won't come back here.
5658                     break;
5659                 }
5660                 // Set up for the next iteration of the loop, with input index
5661                 //   backed up by one from the last time through,
5662                 //   and a state save to this instruction in case the following code fails again.
5663                 //   (We're going backwards because this loop emulates stack unwinding, not
5664                 //    the initial scan forward.)
5665                 U_ASSERT(fp->fInputIdx > 0);
5666                 UChar32 prevC;
5667                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5668 
5669                 if (prevC == 0x0a &&
5670                     fp->fInputIdx > backSearchIndex &&
5671                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5672                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5673                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5674                         // .*, stepping back over CRLF pair.
5675                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5676                     }
5677                 }
5678 
5679 
5680                 fp = StateSave(fp, fp->fPatIdx-1, status);
5681             }
5682             break;
5683 
5684 
5685 
5686         default:
5687             // Trouble.  The compiled pattern contains an entry with an
5688             //           unrecognized type tag.
5689             UPRV_UNREACHABLE_ASSERT;
5690             // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5691             // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5692             // See ICU-21669.
5693             status = U_INTERNAL_PROGRAM_ERROR;
5694         }
5695 
5696         if (U_FAILURE(status)) {
5697             isMatch = FALSE;
5698             break;
5699         }
5700     }
5701 
5702 breakFromLoop:
5703     fMatch = isMatch;
5704     if (isMatch) {
5705         fLastMatchEnd = fMatchEnd;
5706         fMatchStart   = startIdx;
5707         fMatchEnd     = fp->fInputIdx;
5708     }
5709 
5710 #ifdef REGEX_RUN_DEBUG
5711     if (fTraceDebug) {
5712         if (isMatch) {
5713             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5714         } else {
5715             printf("No match\n\n");
5716         }
5717     }
5718 #endif
5719 
5720     fFrame = fp;                // The active stack frame when the engine stopped.
5721                                 //   Contains the capture group results that we need to
5722                                 //    access later.
5723 
5724     return;
5725 }
5726 
5727 
5728 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5729 
5730 U_NAMESPACE_END
5731 
5732 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5733 
5734